def main():
    """Normalize every .txt volume in inputfolder into a .tsv in outputfolder.

    Usage: python <script> inputfolder outputfolder

    Inputs whose .tsv output already exists are skipped; remaining volumes
    are processed in parallel by MultiNormalizeProcess.processvolume across
    a pool of 12 worker processes.
    """
    import FileCabinet
    import NormalizeVolume
    import sys, os
    from multiprocessing import Pool
    import MultiNormalizeProcess

    args = sys.argv

    inputfolder = args[1]
    outputfolder = args[2]

    if not os.path.isdir(inputfolder):
        print("Input folder " + inputfolder + " is not a directory.")
        sys.exit(1)  # BUGFIX: was sys.exit(0) — signal failure to the shell

    if not os.path.isdir(outputfolder):
        print("Output folder " + outputfolder + " is not a directory.")
        sys.exit(1)  # BUGFIX: was sys.exit(0)

    infiles = os.listdir(inputfolder)

    # Outputs are named <stem>.tsv for an input <stem>.txt; map them back to
    # input names so already-finished volumes are skipped.  splitext avoids
    # the str.replace pitfall of '.tsv' matching elsewhere in the filename.
    already_converted = [os.path.splitext(x)[0] + '.txt'
                         for x in os.listdir(outputfolder) if x.endswith('.tsv')]

    not_yet_converted = set(infiles) - set(already_converted)

    print("There are " + str(len(not_yet_converted)) + " files still to convert.")

    # Build the in/out path lists in a single pass so the pairing cannot fall
    # out of step (the original iterated the set twice); sorted for a
    # reproducible processing order.
    inpaths = []
    outpaths = []
    for name in sorted(not_yet_converted):
        if name.endswith('.txt'):
            inpaths.append(os.path.join(inputfolder, name))
            outpaths.append(os.path.join(outputfolder,
                                         os.path.splitext(name)[0] + '.tsv'))

    debug = False

    pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

    datapath = pathdictionary['datapath']
    metadatapath = pathdictionary['metadatapath']
    metaoutpath = pathdictionary['metaoutpath']
    outpath = pathdictionary['outpath']

    # Each work item: (input path, output path, sequence number).
    pathpairs = list(zip(inpaths, outpaths, range(len(inpaths))))

    pool = Pool(processes=12)
    res = pool.map_async(MultiNormalizeProcess.processvolume, pathpairs)
    res.wait()
    resultlist = res.get()
    pool.close()
    pool.join()

    # macOS-only audible notification; a harmless no-op failure elsewhere.
    os.system('say "your program has finished"')
## CONTEXT.py '''Contextual spellchecker. On being imported, it loads rulesets. The function as_stream reduces a file to a tokenstream and tests to see whether this is a long-s file needing correction. (Ideally you should only run it on pre-1830 files that might fall into that category.) Then the function "catch_ambiguities" can be called for a specific file. ''' # IMPORTS. import FileCabinet pathdictionary = FileCabinet.loadpathdictionary() rulepath = pathdictionary['contextrulepath'] # CONSTANTS. delim = '\t' punctuationset = {'.', ',', '?', '!', ';', ')'} # There's a reason why we don't include left paren. See 'catch_ambiguities.' flipslipper = ['flip', 'flips', 'flipped', 'flipping', 'flay', 'flays', 'flayed', "flay'd"] # The triadic problems flip - slip - ship and flay - slay - stay require special treatment. ') felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft'] selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost'] # Of course, either set could be valid. But I expect the second to be more common. # The comparison is used as a test.
breakselected = str.maketrans(BreakablePunctuation, ' ') ## Translation map that erases most punctuation, including hyphens. Punctuation = '.,():-—;"!?•$%@“”#<>+=/[]*^\'{}_■~\\|«»©&~`£·' mosteraser = str.maketrans('', '', Punctuation) punctuple = ('.', ',', '?', '!', ';') delim = '\t' foundcounter = 0 englishcounter = 0 pagedict = dict() import FileCabinet pathdictionary = FileCabinet.loadpathdictionary() rulepath = pathdictionary['volumerulepath'] romannumerals = set() with open(rulepath + 'romannumerals.txt', encoding='utf-8') as file: filelines = file.readlines() for line in filelines: line = line.rstrip() romannumerals.add(line) lexicon = dict() with open(rulepath + 'MainDictionary.txt', encoding='utf-8') as file: filelines = file.readlines()
## We assume the slice name has been passed in as an argument. slicename = sys.argv[1] current_working = os.getcwd() # This is most important when running on the cluster, where files are stored in a pairtree # structure and the only way to know which files we're processing is to list HTIDS in a # "slice" file defining a slice of the collection. # When we're running on a local machine, I usually just group files to be processed in a # directory, and create a list of files to process by listing files in that directory. # However, it's still necessary to have a slicename and slicepath, because these get # used to generate a path for an errorlog and list of long S files. if not testrun: pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/normalize/PathDictionary.txt') if testrun: pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt') datapath = pathdictionary['datapath'] metadatapath = pathdictionary['metadatapath'] metaoutpath = pathdictionary['metaoutpath'] outpath = pathdictionary['outpath'] # only relevant if testrun == True slicepath = pathdictionary['slicepath'] + slicename + '.txt' errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt' longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt' headeroutpath = pathdictionary['slicepath'] + slicename + "headers.txt" # read in special-purpose london phrase list
# USAGE: # from within this /workflow directory: # python NormalizeOneFile.py file_to_crunch.txt > output.tsv # The paths in NormalizeVolume only work if you do it from # within this directory. import FileCabinet import NormalizeVolume import sys debug = False pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt') datapath = pathdictionary['datapath'] metadatapath = pathdictionary['metadatapath'] metaoutpath = pathdictionary['metaoutpath'] outpath = pathdictionary['outpath'] targetfile = sys.argv[1] with open(targetfile, encoding='utf-8') as f: text = f.readlines() tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=debug) correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose = debug) pagecounter = 0 masterdict = dict()
import FileCabinet
import Volume
import Context
import sys

# DEFINE CONSTANTS.
delim = '\t'  # output field delimiter
debug = False

# LOAD PATHS.
## We assume the slice name has been passed in as an argument.
slicename = sys.argv[1]
outfilename = sys.argv[2]

pathdictionary = FileCabinet.loadpathdictionary(
    '/home/tunder/python/tokenize/PathDictionary.txt')

datapath = pathdictionary['datapath']
# Per-slice files: the listing of HathiTrust IDs plus accuracy/error/long-S logs.
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['slicepath'] + slicename + 'acc.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'

with open(slicepath, encoding="utf-8") as file:
    HTIDlist = file.readlines()

HTIDs = set()

# NOTE(review): this loop body appears truncated at the chunk boundary — the
# stripped ID is presumably added to HTIDs in lines beyond this view; confirm.
for thisID in HTIDlist:
    thisID = thisID.rstrip()
import FileCabinet
import Volume
import Context
import sys

# DEFINE CONSTANTS.
delim = '\t'  # output field delimiter
debug = False

# LOAD PATHS.
## We assume the slice name has been passed in as an argument.
slicename = sys.argv[1]
outfilename = sys.argv[2]

pathdictionary = FileCabinet.loadpathdictionary('/home/tunder/python/tokenize/PathDictionary.txt')

datapath = pathdictionary['datapath']
# Per-slice files: the listing of HathiTrust IDs plus accuracy/error/long-S logs.
slicepath = pathdictionary['slicepath'] + slicename + '.txt'
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['slicepath'] + slicename + 'acc.txt'
errorpath = pathdictionary['slicepath'] + slicename + 'errorlog.txt'
longSpath = pathdictionary['slicepath'] + slicename + 'longS.txt'

with open(slicepath, encoding="utf-8") as file:
    HTIDlist = file.readlines()

HTIDs = set()

# NOTE(review): this loop body appears truncated at the chunk boundary — the
# stripped ID is presumably added to HTIDs in lines beyond this view; confirm.
for thisID in HTIDlist:
    thisID = thisID.rstrip()
def main():
    """Normalize every .txt volume in inputfolder into a .tsv in outputfolder.

    Usage: python <script> inputfolder outputfolder

    Inputs whose .tsv output already exists are skipped; remaining volumes
    are processed in parallel by MultiNormalizeProcess.processvolume across
    a pool of 12 worker processes.
    """
    import FileCabinet
    import NormalizeVolume
    import sys, os
    from multiprocessing import Pool
    import MultiNormalizeProcess

    args = sys.argv

    inputfolder = args[1]
    outputfolder = args[2]

    if not os.path.isdir(inputfolder):
        print("Input folder " + inputfolder + " is not a directory.")
        sys.exit(1)  # BUGFIX: was sys.exit(0) — signal failure to the shell

    if not os.path.isdir(outputfolder):
        print("Output folder " + outputfolder + " is not a directory.")
        sys.exit(1)  # BUGFIX: was sys.exit(0)

    infiles = os.listdir(inputfolder)

    # Outputs are named <stem>.tsv for an input <stem>.txt; map them back to
    # input names so already-finished volumes are skipped.  splitext avoids
    # the str.replace pitfall of '.tsv' matching elsewhere in the filename.
    already_converted = [os.path.splitext(x)[0] + '.txt'
                         for x in os.listdir(outputfolder) if x.endswith('.tsv')]

    not_yet_converted = set(infiles) - set(already_converted)

    print("There are " + str(len(not_yet_converted)) + " files still to convert.")

    # Build the in/out path lists in a single pass so the pairing cannot fall
    # out of step (the original iterated the set twice); sorted for a
    # reproducible processing order.
    inpaths = []
    outpaths = []
    for name in sorted(not_yet_converted):
        if name.endswith('.txt'):
            inpaths.append(os.path.join(inputfolder, name))
            outpaths.append(os.path.join(outputfolder,
                                         os.path.splitext(name)[0] + '.tsv'))

    debug = False

    pathdictionary = FileCabinet.loadpathdictionary(
        '/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

    datapath = pathdictionary['datapath']
    metadatapath = pathdictionary['metadatapath']
    metaoutpath = pathdictionary['metaoutpath']
    outpath = pathdictionary['outpath']

    # Each work item: (input path, output path, sequence number).
    pathpairs = list(zip(inpaths, outpaths, range(len(inpaths))))

    pool = Pool(processes=12)
    res = pool.map_async(MultiNormalizeProcess.processvolume, pathpairs)
    res.wait()
    resultlist = res.get()
    pool.close()
    pool.join()

    # macOS-only audible notification; a harmless no-op failure elsewhere.
    os.system('say "your program has finished"')