def processvolume(triplet):
    infile, outfile, integer = triplet
    print(integer)

    try:
        with open(infile, encoding='utf-8') as f:
            text = f.readlines()
    except IOError:
        # Bail out early; everything below needs text.
        print("ERROR reading file " + infile)
        return "null"

    tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=False)
    correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=False)

    # Combine page dictionaries into a single dictionary for the whole volume.
    masterdict = dict()
    for page in pages:
        for item in page:
            if item in masterdict:
                masterdict[item] += page[item]
            else:
                masterdict[item] = page[item]

    with open(outfile, mode='w', encoding='utf-8') as f:
        for key, value in masterdict.items():
            if not key.startswith('#'):
                f.write(key + '\t' + str(value) + '\n')

    print(str(integer) + ' complete.')
    return "null"
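# A minimal sketch of how processvolume might be driven in parallel, assuming
# it is meant to be mapped over (infile, outfile, integer) triplets. The
# run_volumes name, file lists, and pool size are hypothetical, not taken from
# the original code. On platforms that spawn worker processes (Windows, macOS),
# call this under an `if __name__ == "__main__":` guard.
from multiprocessing import Pool

def run_volumes(infiles, outfiles):
    triplets = [(inf, outf, i) for i, (inf, outf) in enumerate(zip(infiles, outfiles))]
    with Pool(processes=4) as pool:
        pool.map(processvolume, triplets)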
def process_a_file(file_tuple):
    global testrun, pairtreepath, datapath, genremapdir, felecterrors, \
        selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

    thisID, metadata_evidence = file_tuple

    perfileerrorlog = list()
    return_dict = dict()
    return_dict["htid"] = thisID
    return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
    return_dict["errors"] = []
    return_dict["phrasecounts"] = dict()

    if testrun:
        cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
    else:
        cleanID = clean_pairtree(thisID)

    if not testrun:
        filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
        filename = filepath + postfix + '/' + postfix + ".zip"
    else:
        filename = datapath + thisID

    # ACTUALLY READ THE FILE.
    if filename.endswith('.zip'):
        pagelist, successflag = read_zip(filename)
    else:
        pagelist, successflag = read_txt(filename)

    if successflag == "missing file":
        print(thisID + " is missing.")
        perfileerrorlog.append(thisID + '\t' + "missing")
        return_dict["errors"] = perfileerrorlog
        return return_dict
    elif successflag == "pagination error":
        print(thisID + " has a pagination problem.")
        perfileerrorlog.append(thisID + '\t' + "paginationerror")
        return_dict["errors"] = perfileerrorlog
        return return_dict
    elif successflag == "unicode error":
        print(thisID + " cannot be decoded as unicode.")
        perfileerrorlog.append(thisID + '\t' + "unicode error")
        return_dict["errors"] = perfileerrorlog
        return return_dict

    tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

    if pre_english < 0.6:
        perfileerrorlog.append(thisID + '\t' + "not english")

    tokencount = len(tokens)

    if len(tokens) < 10:
        print(thisID, "has only tokencount", len(tokens))
        perfileerrorlog.append(thisID + '\t' + 'short')

    correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

    # Combine page dictionaries into a master dictionary.
    # If you ask, why didn't you just produce one in the first place? ...
    # the answer has to do with flexibility of the Volume module for other purposes.

    masterdict = dict()
    for page in pages:
        for item in page:
            if item in masterdict:
                masterdict[item] += page[item]
            else:
                masterdict[item] = page[item]

    # Now that we have a master dictionary, consider whether there are long-s problems.
    # This algorithm works adequately.

    errors = 1
    truths = 1
    # Both counts are initialized to 1 as a Laplacian correction.

    for word in felecterrors:
        errors = errors + masterdict.get(word, 0)

    for word in selecttruths:
        truths = truths + masterdict.get(word, 0)

    LongSproblem = truths <= errors

    if not LongSproblem:
        corrected = correct_tokens
        deleted = dict()
        added = dict()
    else:
        deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
        # Okay, this is not efficient to run, but it's easy to write and there are
        # a small number of these files -- so count the new contextually-corrected
        # tokens by re-running them through Volume.
        correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose=debug)
        corrected = correct_tokens

    # If we are upvoting tokens in the header, they need to be added here.
    if len(pages) != len(headerlist):
        print(thisID + " fails a routine check of alignment between pages and headers.")
    else:
        for index, page in enumerate(pages):
            thispageheader = headerlist[index]
            header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose=debug)
            headerdict = header_pages[0]
            for key, value in headerdict.items():
                if key in meaningfulheaders:
                    if key in page:
                        # A fixed increment, no matter how many times
                        # the word occurs in the header.
                        page[key] += 2
                    else:
                        page[key] = 2
                        print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

    # Write the corrected file.
    cleanHTID = clean_pairtree(thisID)

    if testrun:
        if cleanHTID.endswith(".clean.txt"):
            outHTID = cleanHTID.replace(".clean.txt", "")
        elif cleanHTID.endswith("norm.txt"):
            outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
        elif cleanHTID.endswith(".txt"):
            outHTID = cleanHTID.replace(".txt", "norm.txt")
        else:
            outHTID = cleanHTID + ".norm.txt"
        outfilename = outpath + "texts/" + outHTID
    else:
        outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

    with open(outfilename, mode='w', encoding='utf-8') as file:
        for token in corrected:
            if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
                token = token + " "
            file.write(token)

    if len(pages) != len(pagedata):
        perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)
        return_dict["errors"] = perfileerrorlog
        return return_dict

    totalwordsinvol = 0

    if testrun:
        if cleanHTID.endswith(".clean.txt"):
            outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
        elif cleanHTID.endswith("norm.txt"):
            outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
        elif cleanHTID.endswith(".txt"):
            outHTID = cleanHTID.replace(".txt", ".pg.tsv")
        else:
            outHTID = cleanHTID + ".pg.tsv"
        outfilename = outpath + "pagefeatures/" + outHTID
    else:
        outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

    with open(outfilename, mode='w', encoding='utf-8') as file:
        if metadata_evidence["biography"]:
            file.write("-1\t#metaBiography\t0\n")
        if metadata_evidence["drama"]:
            file.write("-1\t#metaDrama\t0\n")
        if metadata_evidence["fiction"]:
            file.write("-1\t#metaFiction\t0\n")
        if metadata_evidence["poetry"]:
            file.write("-1\t#metaPoetry\t0\n")

        numberofpages = len(pages)
        for index, page in enumerate(pages):
            # This is a shameful hack that should be deleted later.
            if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
                continue
            if testrun and "untypical" in page and (index + 2) > numberofpages:
                continue

            otherfeatures = 0
            for feature, count in page.items():
                if feature in pagevocabset or feature.startswith("#"):
                    # pagenumber, featurename, featurecount
                    outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
                    file.write(outline)
                else:
                    otherfeatures += count

                if not feature.startswith("#"):
                    totalwordsinvol += count
                # This is because there are structural features like #allcapswords
                # that should not be counted toward total token count.

            structural_features = pagedata[index]
            for feature, count in structural_features.items():
                if count > 0 or feature == "#textlines":
                    outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
                    file.write(outline)

            if otherfeatures > 0:
                outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
                file.write(outline)

    metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

    return_dict["metadata"] = metatuple
    return_dict["errors"] = perfileerrorlog

    return return_dict
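# A minimal sketch of a driver that maps process_a_file over many volumes and
# aggregates the per-file results, assuming the (htid, metadata_evidence)
# tuples come from some metadata table. The names run_batch, filetuples,
# n_workers, and errorlog.txt are hypothetical, not part of the original
# workflow.
from multiprocessing import Pool

def run_batch(filetuples, n_workers=4):
    with Pool(processes=n_workers) as pool:
        results = pool.map(process_a_file, filetuples)

    # Each worker returns a dict with "htid", "metadata", and "errors";
    # collect the metadata tuples and flush every per-file error line.
    metadata_rows = [r["metadata"] for r in results]
    with open("errorlog.txt", mode='w', encoding='utf-8') as f:
        for r in results:
            for line in r["errors"]:
                f.write(line + '\n')
    return metadata_rows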
import sys

pathdictionary = FileCabinet.loadpathdictionary('/Users/tunder/Dropbox/PythonScripts/workflow/PathDictionary.txt')

datapath = pathdictionary['datapath']
metadatapath = pathdictionary['metadatapath']
metaoutpath = pathdictionary['metaoutpath']
outpath = pathdictionary['outpath']

debug = False  # not defined in the original fragment; assumed off by default

targetfile = sys.argv[1]
with open(targetfile, encoding='utf-8') as f:
    text = f.readlines()

tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream([text], verbose=debug)
correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

# Combine page dictionaries into a single dictionary for the whole volume.
masterdict = dict()
for page in pages:
    for item in page:
        if item in masterdict:
            masterdict[item] += page[item]
        else:
            masterdict[item] = page[item]

for key, value in masterdict.items():
    if not key.startswith('#'):
        print(key + '\t' + str(value))
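# Example invocation, assuming the script above is saved as normalize_one.py
# (a hypothetical filename) and PathDictionary.txt points at valid paths:
#
#   python normalize_one.py /path/to/volume.txt
#
# It prints tab-separated (token, count) pairs for the whole volume to stdout.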
# with open(headeroutpath, mode="a", encoding="utf-8") as f:
#     for astream in headerlist:
#         if len(astream) > 0:
#             outline = " ".join([x for x in astream])
#             f.write(outline + '\n')
#     f.write("--------- " + thisID + " ---------\n")
#
# === commented out because we don't really need to write these ===

tokencount = len(tokens)

if len(tokens) < 10:
    print(thisID, "has only tokencount", len(tokens))
    errorlog.append(thisID + '\t' + 'short')

correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose=debug)

# Combine page dictionaries into a master dictionary.
# If you ask, why didn't you just produce one in the first place? ...
# the answer has to do with flexibility of the Volume module for other purposes.

masterdict = dict()
for page in pages:
    for item in page:
        if item in masterdict:
            masterdict[item] += page[item]
        else:
            masterdict[item] = page[item]

# Now that we have a master dictionary, consider whether there are long-s problems.
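# The long-s test that follows in the full script compares Laplace-smoothed
# counts of known misrecognitions (felecterrors) against known good spellings
# (selecttruths). A sketch of that decision rule, factored into a standalone
# function for clarity; the function name is hypothetical:
def has_long_s_problem(masterdict, felecterrors, selecttruths):
    # Starting both counts at 1 is the Laplacian correction: it keeps the
    # comparison defined even when neither list matches anything in the volume.
    errors = 1 + sum(masterdict.get(word, 0) for word in felecterrors)
    truths = 1 + sum(masterdict.get(word, 0) for word in selecttruths)
    return truths <= errors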