コード例 #1
0
def main():
    """Batch-normalize .txt volumes from inputfolder into .tsv files in outputfolder.

    Usage: python <script> inputfolder outputfolder

    Files that already have a .tsv counterpart in outputfolder are skipped;
    the remaining .txt files are processed in parallel by
    MultiNormalizeProcess.processvolume.
    """
    import FileCabinet
    # NormalizeVolume is imported for its module-load side effects
    # (it loads rulesets on import) — TODO confirm it can be dropped.
    import NormalizeVolume
    import sys, os
    from multiprocessing import Pool
    import MultiNormalizeProcess
    args = sys.argv

    inputfolder = args[1]
    outputfolder = args[2]

    if not os.path.isdir(inputfolder):
        print("Input folder " + inputfolder + " is not a directory.")
        # Exit nonzero so shell scripts can detect the failure.
        sys.exit(1)

    if not os.path.isdir(outputfolder):
        print("Output folder " + outputfolder + " is not a directory.")
        sys.exit(1)

    infiles = os.listdir(inputfolder)

    # Map existing outputs back to their source names by swapping only the
    # FINAL extension; str.replace('.tsv', '.txt') would also hit '.tsv'
    # occurring in the middle of a filename.
    already_converted = [x[:-len('.tsv')] + '.txt'
                         for x in os.listdir(outputfolder)
                         if x.endswith('.tsv')]

    not_yet_converted = set(infiles) - set(already_converted)

    print("There are " + str(len(not_yet_converted)) + " files still to convert.")

    # Iterate the set once (sorted for a deterministic order) so the input
    # and output lists are guaranteed to be built in the same order.
    txt_files = sorted(x for x in not_yet_converted if x.endswith('.txt'))
    inpaths = [os.path.join(inputfolder, x) for x in txt_files]
    # Swap only the trailing extension; replacing the first '.txt' in the
    # joined path could corrupt a directory name containing '.txt'.
    outpaths = [os.path.join(outputfolder, x[:-len('.txt')] + '.tsv')
                for x in txt_files]

    # Loaded for parity with the rest of the workflow; the individual path
    # values are unused in this function — TODO confirm the call itself
    # is not needed for its side effects before removing.
    pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

    pathpairs = list(zip(inpaths, outpaths, range(len(inpaths))))

    # Context manager guarantees the pool is cleaned up even if a worker
    # raises; map() blocks and re-raises any worker exception here, which
    # matches the original map_async/wait/get behavior.
    with Pool(processes=12) as pool:
        resultlist = pool.map(MultiNormalizeProcess.processvolume, pathpairs)

    # NOTE: 'say' is macOS-only; elsewhere this fails harmlessly.
    os.system('say "your program has finished"')
コード例 #2
0
ファイル: Context.py プロジェクト: Anterotesis/DataMunging
## CONTEXT.py

'''Contextual spellchecker. On being imported, it loads rulesets.
   The function as_stream reduces a file to a tokenstream and tests
   to see whether this is a long-s file needing correction. (Ideally
   you should only run it on pre-1830 files that might fall into that
   category.)
   Then the function "catch_ambiguities" can be called for a specific file.
'''

# IMPORTS.

import FileCabinet

# Module-load side effect: read the path dictionary and locate the
# directory containing the context rules.
pathdictionary = FileCabinet.loadpathdictionary()
rulepath = pathdictionary['contextrulepath']

# CONSTANTS.

delim = '\t'  # field separator for tab-delimited rule files / output
punctuationset = {'.', ',', '?', '!', ';', ')'}
# There's a reason why we don't include left paren. See 'catch_ambiguities.'

flipslipper = ['flip', 'flips', 'flipped', 'flipping', 'flay', 'flays', 'flayed', "flay'd"]
# The triadic problems flip - slip - ship and flay - slay - stay require special treatment.

# Parallel lists: felecterrors[i] appears to be the long-s (f-for-s)
# misreading of selecttruths[i] — e.g. fee/see, fea/sea — TODO confirm.
felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft']
selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost']
# Of course, either set could be valid. But I expect the second to be more common.
# The comparison is used as a test.
コード例 #3
0
# NOTE(review): this chunk starts mid-script; BreakablePunctuation is
# defined earlier in the file and is not visible here.
breakselected = str.maketrans(BreakablePunctuation, '       ')

## Translation map that erases most punctuation, including hyphens.
Punctuation = '.,():-—;"!?•$%@“”#<>+=/[]*^\'{}_■~\\|«»©&~`£·'
mosteraser = str.maketrans('', '', Punctuation)

punctuple = ('.', ',', '?', '!', ';')

delim = '\t'         # field separator for tab-delimited files
foundcounter = 0     # running counters, updated elsewhere in the file
englishcounter = 0
pagedict = dict()

import FileCabinet

# Module-load side effect: locate the directory of volume-level rules.
pathdictionary = FileCabinet.loadpathdictionary()

rulepath = pathdictionary['volumerulepath']

# Load the list of roman numerals (one per line); presumably used to
# recognize roman-numeral page numbers — TODO confirm against callers.
romannumerals = set()
with open(rulepath + 'romannumerals.txt', encoding='utf-8') as file:
    filelines = file.readlines()

for line in filelines:
    line = line.rstrip()
    romannumerals.add(line)

lexicon = dict()

# NOTE(review): chunk is truncated here — MainDictionary.txt is read but
# processed below, outside this view.
with open(rulepath + 'MainDictionary.txt', encoding='utf-8') as file:
    filelines = file.readlines()
コード例 #4
0
## We assume the slice name has been passed in as an argument.
# NOTE(review): this chunk starts mid-script; sys, os, testrun and
# FileCabinet are bound earlier in the file, outside this view.
slicename = sys.argv[1]
current_working = os.getcwd()

# This is most important when running on the cluster, where files are stored in a pairtree
# structure and the only way to know which files we're processing is to list HTIDS in a
# "slice" file defining a slice of the collection.

# When we're running on a local machine, I usually just group files to be processed in a
# directory, and create a list of files to process by listing files in that directory.
# However, it's still necessary to have a slicename and slicepath, because these get
# used to generate a path for an errorlog and list of long S files.

# Hard-coded, machine-specific paths: cluster vs. the author's laptop.
if not testrun:
	pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/normalize/PathDictionary.txt')
if testrun:
	pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']
# only relevant if testrun == True

# Derived per-slice paths: the HTID list itself, plus companion files for
# the error log, long-s volume list, and extracted headers.
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'
headeroutpath = pathdictionary['slicepath'] + slicename + "headers.txt"

# read in special-purpose london phrase list
コード例 #5
0
## We assume the slice name has been passed in as an argument.
# NOTE(review): this chunk starts mid-script; sys, os, testrun and
# FileCabinet are bound earlier in the file, outside this view.
slicename = sys.argv[1]
current_working = os.getcwd()

# This is most important when running on the cluster, where files are stored in a pairtree
# structure and the only way to know which files we're processing is to list HTIDS in a
# "slice" file defining a slice of the collection.

# When we're running on a local machine, I usually just group files to be processed in a
# directory, and create a list of files to process by listing files in that directory.
# However, it's still necessary to have a slicename and slicepath, because these get
# used to generate a path for an errorlog and list of long S files.

# Hard-coded, machine-specific paths: cluster vs. the author's laptop.
if not testrun:
	pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/normalize/PathDictionary.txt')
if testrun:
	pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']
# only relevant if testrun == True

# Derived per-slice paths: the HTID list itself, plus companion files for
# the error log, long-s volume list, and extracted headers.
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'
headeroutpath = pathdictionary['slicepath'] + slicename + "headers.txt"

# read in special-purpose london phrase list
コード例 #6
0
# USAGE:
# from within this /workflow directory:
# python NormalizeOneFile.py file_to_crunch.txt > output.tsv

# The paths in NormalizeVolume only work if you do it from
# within this directory.

import FileCabinet
import NormalizeVolume
import sys

debug = False  # forwarded as verbose= to the NormalizeVolume calls below

# Hard-coded path: only valid on the author's machine.
pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']

targetfile = sys.argv[1]

with open(targetfile, encoding='utf-8') as f:
    text = f.readlines()

# as_stream appears to expect a list of pages (each a list of lines);
# the whole file is wrapped as a single "page" here — TODO confirm.
tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=debug)

correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose = debug)

# NOTE(review): chunk is truncated — pagecounter and masterdict are used
# below, outside this view.
pagecounter = 0
masterdict = dict()
コード例 #7
0
ファイル: Tokenize.py プロジェクト: Borel1/GenreProject
import FileCabinet
import Volume
import Context
import sys

# DEFINE CONSTANTS.
delim = '\t'   # field separator for tab-delimited output
debug = False

# LOAD PATHS.
# Slice name and output filename are passed as command-line arguments.
slicename = sys.argv[1]
outfilename = sys.argv[2]

## We assume the slice name has been passed in as an argument.

# Hard-coded cluster path to the path dictionary.
pathdictionary = FileCabinet.loadpathdictionary(
    '/home/tunder/python/tokenize/PathDictionary.txt')

# Per-slice paths: the HTID list itself plus companion accuracy,
# error-log, and long-s files, all keyed on the slice name.
datapath = pathdictionary['datapath']
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['slicepath'] + slicename + 'acc.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'

with open(slicepath, encoding="utf-8") as file:
    HTIDlist = file.readlines()

HTIDs = set()

# NOTE(review): loop is truncated here — the rstripped IDs are presumably
# added to HTIDs below, outside this view.
for thisID in HTIDlist:
    thisID = thisID.rstrip()
コード例 #8
0
ファイル: Tokenize.py プロジェクト: Borel1/GenreProject
import FileCabinet
import Volume
import Context
import sys

# DEFINE CONSTANTS.
delim = '\t'   # field separator for tab-delimited output
debug = False

# LOAD PATHS.
# Slice name and output filename are passed as command-line arguments.
slicename = sys.argv[1]
outfilename = sys.argv[2]

## We assume the slice name has been passed in as an argument.

# Hard-coded cluster path to the path dictionary.
pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/tokenize/PathDictionary.txt')

# Per-slice paths: the HTID list itself plus companion accuracy,
# error-log, and long-s files, all keyed on the slice name.
datapath = pathdictionary['datapath']
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['slicepath'] + slicename + 'acc.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'

with open(slicepath, encoding="utf-8") as file:
    HTIDlist = file.readlines()

HTIDs = set()

# NOTE(review): loop is truncated here — the rstripped IDs are presumably
# added to HTIDs below, outside this view.
for thisID in HTIDlist:
    thisID = thisID.rstrip()
コード例 #9
0
def main():
    """Batch-normalize .txt volumes from inputfolder into .tsv files in outputfolder.

    Usage: python <script> inputfolder outputfolder

    Files that already have a .tsv counterpart in outputfolder are skipped;
    the remaining .txt files are processed in parallel by
    MultiNormalizeProcess.processvolume.
    """
    import FileCabinet
    # NormalizeVolume is imported for its module-load side effects
    # (it loads rulesets on import) — TODO confirm it can be dropped.
    import NormalizeVolume
    import sys, os
    from multiprocessing import Pool
    import MultiNormalizeProcess
    args = sys.argv

    inputfolder = args[1]
    outputfolder = args[2]

    if not os.path.isdir(inputfolder):
        print("Input folder " + inputfolder + " is not a directory.")
        # Exit nonzero so shell scripts can detect the failure.
        sys.exit(1)

    if not os.path.isdir(outputfolder):
        print("Output folder " + outputfolder + " is not a directory.")
        sys.exit(1)

    infiles = os.listdir(inputfolder)

    # Map existing outputs back to their source names by swapping only the
    # FINAL extension; str.replace('.tsv', '.txt') would also hit '.tsv'
    # occurring in the middle of a filename.
    already_converted = [
        x[:-len('.tsv')] + '.txt' for x in os.listdir(outputfolder)
        if x.endswith('.tsv')
    ]

    not_yet_converted = set(infiles) - set(already_converted)

    print("There are " + str(len(not_yet_converted)) +
          " files still to convert.")

    # Iterate the set once (sorted for a deterministic order) so the input
    # and output lists are guaranteed to be built in the same order.
    txt_files = sorted(x for x in not_yet_converted if x.endswith('.txt'))
    inpaths = [os.path.join(inputfolder, x) for x in txt_files]
    # Swap only the trailing extension; replacing the first '.txt' in the
    # joined path could corrupt a directory name containing '.txt'.
    outpaths = [
        os.path.join(outputfolder, x[:-len('.txt')] + '.tsv')
        for x in txt_files
    ]

    # Loaded for parity with the rest of the workflow; the individual path
    # values are unused in this function — TODO confirm the call itself
    # is not needed for its side effects before removing.
    pathdictionary = FileCabinet.loadpathdictionary(
        '/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

    pathpairs = list(zip(inpaths, outpaths, range(len(inpaths))))

    # Context manager guarantees the pool is cleaned up even if a worker
    # raises; map() blocks and re-raises any worker exception here, which
    # matches the original map_async/wait/get behavior.
    with Pool(processes=12) as pool:
        resultlist = pool.map(MultiNormalizeProcess.processvolume, pathpairs)

    # NOTE: 'say' is macOS-only; elsewhere this fails harmlessly.
    os.system('say "your program has finished"')