Python FileCabinet.pairtreepath 예제들

프로그래밍 언어: Python

클래스/타입: FileCabinet

메소드/함수: pairtreepath

hotexamples.com에서의 예제들: 8

Python FileCabinet.pairtreepath - 8개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 FileCabinet.pairtreepath 패키지로부터 peppy에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

loadpathdictionary(5)

pairtreepath(4)

get_most_recent(2)

get_wordcounts(2)

get_entries_by_date(1)

get_one(1)

get_pairedpaths(1)

get_wordfreqs(1)

parse_authordate(1)

예제 #1

파일 보기

def process_a_file(file_tuple):
	global testrun, pairtreepath, datapath, genremapdir, felecterrors, selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

	thisID, metadata_evidence = file_tuple

	perfileerrorlog = list()
	return_dict = dict()
	return_dict["htid"] = thisID
	return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
	return_dict["errors"] = []
	return_dict["phrasecounts"] = dict()

	if testrun:
		cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
	else:
		cleanID = clean_pairtree(thisID)

	if not testrun:
		filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
		filename = filepath + postfix + '/' + postfix + ".zip"
	else:
		filename = datapath + thisID

	# ACTUALLY READ THE FILE.

	if filename.endswith('.zip'):
		pagelist, successflag = read_zip(filename)
	else:
		pagelist, successflag = read_txt(filename)

	if successflag == "missing file":
		print(thisID + " is missing.")
		perfileerrorlog.append(thisID + '\t' + "missing")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "pagination error":
		print(thisID + " has a pagination problem.")
		perfileerrorlog.append(thisID + '\t' + "paginationerror")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "unicode error":
		print(thisID + " can not be decoded by unicode.")
		perfileerrorlog.append(thisID + '\t' + "unicode error")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

	if pre_english < 0.6:
		perfileerrorlog.append(thisID + '\t' + "not english")

	tokencount = len(tokens)

	if len(tokens) < 10:
		print(thisID, "has only tokencount", len(tokens))
		perfileerrorlog.append(thisID + '\t' + 'short')

	correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose = debug)

	# Combine page dictionaries into a master dictionary.
	# If you ask, why didn't you just produce one in the first place? ...
	# answer has to do with flexibility of the Volume module for other purposes.

	pagecounter = 0
	masterdict = dict()
	for page in pages:
		for item in page:
			if item in masterdict:
				masterdict[item] += page[item]
			else:
				masterdict[item] = page[item]

	# Now that we have a master dictionary, consider whether there are long-s problems.
	# This algorithm works adequately.

	errors = 1
	truths = 1
	# Initialized to 1 as a Laplacian correction.

	for word in felecterrors:
		errors = errors + masterdict.get(word, 0)
	for word in selecttruths:
		truths = truths + masterdict.get(word, 0)

	if truths > errors:
		LongSproblem = False
	else:
		LongSproblem = True

	if LongSproblem == False:
		corrected = correct_tokens
		deleted = dict()
		added = dict()
	else:
		deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
		# okay, this is crazy and not efficient to run, but it's easy to write and there are a small number
		# of these files -- so I'm going to count the new contextually-corrected tokens by re-running them
		# through Volume.
		correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose = debug)

		corrected = correct_tokens

	# If we are upvoting tokens in the header, they need to be added here.

	if len(pages) != len(headerlist):
		print(thisID + " fails a routine check of alignment between pages and headers.")
	else:
		for index, page in enumerate(pages):
			thispageheader = headerlist[index]
			header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose = debug)
			headerdict = header_pages[0]
			for key, value in headerdict.items():
				if key in meaningfulheaders:
					if key in page:
						page[key] += 2
						# a fixed increment no matter how many times the word occurs in the
						# header
					else:
						page[key] = 2
						print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

	# Write corrected file.
	cleanHTID = clean_pairtree(thisID)

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", "")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", "norm.txt")
		else:
			outHTID = cleanHTID + ".norm.txt"

		outfilename = outpath + "texts/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

	with open(outfilename, mode = 'w', encoding = 'utf-8') as file:
		for token in corrected:
			if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
				token = token + " "
			file.write(token)

	if len(pages) != len(pagedata):
		perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)
		return_dict["errors"] = perfileerrorlog
		return return_dict

	totalwordsinvol = 0

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", ".pg.tsv")
		else:
			outHTID = cleanHTID + ".pg.tsv"

		outfilename = outpath + "pagefeatures/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

	with open(outfilename, mode = 'w', encoding = 'utf-8') as file:

		if metadata_evidence["biography"]:
			file.write("-1\t#metaBiography\t0\n")

		if metadata_evidence["drama"]:
			file.write("-1\t#metaDrama\t0\n")

		if metadata_evidence["fiction"]:
			file.write("-1\t#metaFiction\t0\n")

		if metadata_evidence["poetry"]:
			file.write("-1\t#metaPoetry\t0\n")

		numberofpages = len(pages)
		for index, page in enumerate(pages):

			# This is a shameful hack that should be deleted later.
			if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
				continue
			if testrun and "untypical" in page and (index +2) > numberofpages:
				continue

			otherfeatures = 0

			for feature, count in page.items():
				if feature in pagevocabset or feature.startswith("#"):
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					# pagenumber, featurename, featurecount
					file.write(outline)
				else:
					otherfeatures += count

				if not feature.startswith("#"):
					totalwordsinvol += count
				# This is because there are structural features like #allcapswords
				# that should not be counted toward total token count.

			structural_features = pagedata[index]
			for feature, count in structural_features.items():
				if count > 0 or feature == "#textlines":
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					file.write(outline)

			if otherfeatures > 0:
				outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
				file.write(outline)

	metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

	return_dict["metadata"] = metatuple
	return_dict["errors"] = perfileerrorlog

	return return_dict

예제 #2

파일 보기

파일: MultiNormalizeOCR.py 프로젝트: cmchurch/DataMunging

def process_a_file(file_tuple):
	global testrun, pairtreepath, datapath, genremapdir, felecterrors, selecttruths, debug, phraseset, pagevocabset, meaningfulheaders

	thisID, metadata_evidence = file_tuple

	perfileerrorlog = list()
	return_dict = dict()
	return_dict["htid"] = thisID
	return_dict["metadata"] = (thisID, "0", "0", "0", "0", "0")
	return_dict["errors"] = []
	return_dict["phrasecounts"] = dict()

	if testrun:
		cleanID = clean_pairtree(thisID.replace("norm.txt", ""))
	else:
		cleanID = clean_pairtree(thisID)

	if not testrun:
		filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
		filename = filepath + postfix + '/' + postfix + ".zip"
	else:
		filename = datapath + thisID

	# ACTUALLY READ THE FILE.

	if filename.endswith('.zip'):
		pagelist, successflag = read_zip(filename)
	else:
		pagelist, successflag = read_txt(filename)

	if successflag == "missing file":
		print(thisID + " is missing.")
		perfileerrorlog.append(thisID + '\t' + "missing")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	elif successflag == "pagination error":
		print(thisID + " has a pagination problem.")
		perfileerrorlog.append(thisID + '\t' + "paginationerror")
		return_dict["errors"] = perfileerrorlog
		return return_dict

	tokens, pre_matched, pre_english, pagedata, headerlist = NormalizeVolume.as_stream(pagelist, verbose=debug)

	if pre_english < 0.6:
		perfileerrorlog.append(thisID + '\t' + "not english")

	tokencount = len(tokens)

	if len(tokens) < 10:
		print(thisID, "has only tokencount", len(tokens))
		perfileerrorlog.append(thisID + '\t' + 'short')

	correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(tokens, verbose = debug)

	# Combine page dictionaries into a master dictionary.
	# If you ask, why didn't you just produce one in the first place? ...
	# answer has to do with flexibility of the Volume module for other purposes.

	pagecounter = 0
	masterdict = dict()
	for page in pages:
		for item in page:
			if item in masterdict:
				masterdict[item] += page[item]
			else:
				masterdict[item] = page[item]

	# Now that we have a master dictionary, consider whether there are long-s problems.
	# This algorithm works adequately.

	errors = 1
	truths = 1
	# Initialized to 1 as a Laplacian correction.

	for word in felecterrors:
		errors = errors + masterdict.get(word, 0)
	for word in selecttruths:
		truths = truths + masterdict.get(word, 0)

	if truths > errors:
		LongSproblem = False
	else:
		LongSproblem = True

	if LongSproblem == False:
		corrected = correct_tokens
		deleted = dict()
		added = dict()
	else:
		deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)
		# okay, this is crazy and not efficient to run, but it's easy to write and there are a small number
		# of these files -- so I'm going to count the new contextually-corrected tokens by re-running them
		# through Volume.
		correct_tokens, pages, post_matched, post_english = NormalizeVolume.correct_stream(corrected, verbose = debug)

		corrected = correct_tokens

	# If we are upvoting tokens in the header, they need to be added here.

	for index, page in enumerate(pages):
		thispageheader = headerlist[index]
		header_tokens, header_pages, dummy1, dummy2 = NormalizeVolume.correct_stream(thispageheader, verbose = debug)
		headerdict = header_pages[0]
		for key, value in headerdict.items():
			if key in meaningfulheaders:
				if key in page:
					page[key] += 2
					# a fixed increment no matter how many times the word occurs in the
					# header
				else:
					page[key] = 2
					print("Word " + key + " in headerdict for " + thisID + " at " + str(index) + " but not main page.")

	# Write corrected file.
	cleanHTID = clean_pairtree(thisID)

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", "")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".norm.txt")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", "norm.txt")
		else:
			outHTID = cleanHTID + ".norm.txt"

		outfilename = outpath + "texts/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".norm.txt"

	with open(outfilename, mode = 'w', encoding = 'utf-8') as file:
		for token in corrected:
			if token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
				token = token + " "
			file.write(token)

	if len(pages) != len(pagedata):
		perfileerrorlog.append("Discrepancy between page data and page metadata in \t" + thisID)

	totalwordsinvol = 0

	if testrun:
		if cleanHTID.endswith(".clean.txt"):
			outHTID = cleanHTID.replace(".clean.txt", ".pg.tsv")
		elif cleanHTID.endswith("norm.txt"):
			outHTID = cleanHTID.replace("norm.txt", ".pg.tsv")
		elif cleanHTID.endswith(".txt"):
			outHTID = cleanHTID.replace(".txt", ".pg.tsv")
		else:
			outHTID = cleanHTID + ".pg.tsv"

		outfilename = outpath + "pagefeatures/" + outHTID
	else:
		outfilename = filepath + postfix + '/' + postfix + ".pg.tsv"

	with open(outfilename, mode = 'w', encoding = 'utf-8') as file:

		if metadata_evidence["biography"]:
			file.write("-1\t#metaBiography\t0\n")

		if metadata_evidence["drama"]:
			file.write("-1\t#metaDrama\t0\n")

		if metadata_evidence["fiction"]:
			file.write("-1\t#metaFiction\t0\n")

		if metadata_evidence["poetry"]:
			file.write("-1\t#metaPoetry\t0\n")

		numberofpages = len(pages)
		for index, page in enumerate(pages):

			# This is a shameful hack that should be deleted later.
			if testrun and "estimated" in page and "percentage" in page and (index + 3) > numberofpages:
				continue
			if testrun and "untypical" in page and (index +2) > numberofpages:
				continue

			otherfeatures = 0

			for feature, count in page.items():
				if feature in pagevocabset or feature.startswith("#"):
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					# pagenumber, featurename, featurecount
					file.write(outline)
				else:
					otherfeatures += count

				if not feature.startswith("#"):
					totalwordsinvol += count
				# This is because there are structural features like #allcapswords
				# that should not be counted toward total token count.

			structural_features = pagedata[index]
			for feature, count in structural_features.items():
				if count > 0 or feature == "#textlines":
					outline = str(index) + '\t' + feature + '\t' + str(count) + '\n'
					file.write(outline)

			if otherfeatures > 0:
				outline = str(index) + '\t' + "wordNotInVocab" + '\t' + str(otherfeatures) + '\n'
				file.write(outline)

	metatuple = (thisID, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched), str(post_english))

	return_dict["metadata"] = metatuple
	return_dict["errors"] = perfileerrorlog

	return return_dict

예제 #3

파일 보기

Lexicon = Dictionary.BuildLexicon(dictionarypath, debug)

writename = 'typeindex.txt'

delim = '\t'

if startindex == 0:
    BigIndex = dict()
else:
    BigIndex = TypeIndex.ReadIndex(writename, delim, debug)

SortedIndex = list()

for index in range(startindex, endindex):
    IDtoprocess = HTIDlist[index].strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError as e:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue

    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)

예제 #4

파일 보기

def main():
    import FileCabinet
    import FileUtils
    import Volume2
    import Context
    import sys
    import os

    # DEFINE CONSTANTS.
    delim = '\t'
    debug = False
    felecterrors = [
        'fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft',
        'fat', 'fix', 'chafe', 'loft'
    ]
    selecttruths = [
        'see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast',
        'sat', 'six', 'chase', 'lost'
    ]

    # Locate ourselves in the directory structure.

    cwd = os.getcwd()
    cwdparent = os.path.abspath(os.path.join(cwd, os.pardir))

    # We need to find a directory called 'rulesets,' which we expect to be located
    # either within the working directory or adjacent to it.

    if os.path.isdir(os.path.join(cwd, "rulesets")):
        rulepath = os.path.join(cwd, "rulesets")
    elif os.path.isdir(os.path.join(cwdparent, "rulesets")):
        rulepath = os.path.join(cwdparent, "rulesets")
    else:
        user = input("Please specify a path to the ruleset directory: ")
        if os.path.isdir(user):
            rulepath = user
        else:
            print("Invalid path.")
            sys.exit()

    # Use rulepath to load relevant rules inside modules.

    Volume2.importrules(rulepath)
    Context.importrules(rulepath)

    # Now we enter dialogue with the user. This is all a little 1982,
    # but what can I say? Wetware upgrades are expensive.

    def prompt(promptstring, options):
        user = input(promptstring)
        if user not in options:
            user = prompt(promptstring, options)
        return user

    # Ask the user to tell us how to find files to process.
    print("****************** CorrectOCR 0.1 ******************")
    print()
    print("Do you want the full spiel (explanations, caveats, etc.)")
    user = prompt("y/n : ", ["y", "n"])

    if user.lower() == "y":
        spielpath = os.path.join(cwd, "spiel.txt")
        with open(spielpath, encoding='utf-8') as file:
            filelines = file.readlines()
        for line in filelines:
            print(line, end='')

    print("\nThis script will correct .txt files, or extract text")
    print("from zipped archives containing one txt file for each page.")
    print("In either case it writes the cleaned files back to their")
    print("original locations with the new suffix '.clean.txt'.")
    print("\nDo you want to unpack .zip files or .txt files?")
    user = prompt("zip or txt: ", ["zip", "txt"])
    suffix = "." + user
    suffixlen = len(suffix)

    print("\nThere are two ways to identify the location of the")
    print("files to be corrected.")
    print("\n1. Provide the path to a folder that contains them. I'll")
    print("recursively search subdirectories of that folder as well. Or,")
    print("\n2. Provide a file holding a list of pairtree file identifiers,")
    print("e.g. HathiTrust Volume IDs. I can use those identifiers to infer")
    print("the paths to the files themselves.\n")

    user = prompt("Which option do you prefer (1 or 2)? ", ["1", "2"])

    if user == "1":
        rootpath = input("Path to the folder that contains source files: ")
        filelist = FileUtils.recursivefilegetter(rootpath, suffix)

    else:
        print("I expect the pairtree identifiers to be listed one per line,")
        print("and to be the only item on a line.")
        filepath = input(
            "Path to the file that contains pairtree identifiers: ")
        filelist = list()
        with open(filepath, encoding='utf-8') as file:
            filelines = file.readlines()

        print(
            "Now I need a path to the folder that contains the pairtree structure."
        )
        print(
            "If you have multiple folders for different libraries, this should be"
        )
        print("the folder above them all. It should end with a slash.")
        rootpath = input("Path to the folder that contains pairtree: ")
        for line in filelines:
            line = line.rstrip()
            filepath, postfix = FileCabinet.pairtreepath(line, rootpath)
            filename = filepath + postfix + '/' + postfix + suffix
            filelist.append(filename)

    print("\nI identified", len(filelist), "files in that location.")

    print("\nI can just write clean text files (with suffix clean.txt)")
    print("or I can also write tab-separated files that count the words")
    print("in each file after correction.")
    user = prompt("1) Text only or 2) text-plus-wordcounts? (1 or 2): ",
                  ["1", "2"])
    if user == "1":
        wordcountflag = False
    else:
        wordcountflag = True

    print("Now proceeding to process the files.\n")

    def subtract_counts(token, adict, tosubtract):
        '''Adjusts a dictionary by subtracting tosubtract instances of token.'''
        if token in adict:
            adict[token] = adict[token] - tosubtract
            if adict[token] < 0:
                del adict[token]
            elif adict[token] < 1:
                del adict[token]
        return adict

    def add_counts(token, adict, toadd):
        '''Adjusts a dictionary by adding toadd instances of token.'''
        if token in adict:
            adict[token] = adict[token] + toadd
        else:
            adict[token] = toadd
        return adict

    # Here's where we BEGIN THE ACTUAL CORRECTION OF FILES.

    processedmeta = list()
    errorlog = list()
    longSfiles = list()

    count = 0

    for filename in filelist:

        try:
            if suffix == ".zip":
                lines = FileUtils.readzip(filename)
                successflag = True
            else:
                with open(filename, encoding='utf-8') as file:
                    lines = file.readlines()
                    successflag = True
        except IOError as e:
            successflag = False

        if not successflag:
            print(filename + " is missing.")
            errorlog.append(filename + '\t' + "missing")
            continue

        tokens, pre_matched, pre_english = Volume2.as_stream(lines,
                                                             verbose=debug)

        tokencount = len(tokens)

        if len(tokens) < 10:
            print(filename, "has only tokencount", len(tokens))
            errorlog.append(filename + '\t' + 'short')

        correct_tokens, pages, post_matched, post_english = Volume2.correct_stream(
            tokens, verbose=debug)

        # Combine page dictionaries into a master dictionary.
        # If you ask, why didn't you just produce one in the first place? ...
        # answer has to do with flexibility of the Volume module for other purposes.

        pagecounter = 0
        masterdict = dict()
        for page in pages:
            for item in page:
                if item in masterdict:
                    masterdict[item] += page[item]
                else:
                    masterdict[item] = page[item]

        # Now that we have a master dictionary, consider whether there are long-s problems.
        # This algorithm works adequately.

        errors = 1
        truths = 1

        totaladded = 0
        totaldeleted = 0

        # Initialized to 1 as a Laplacian correction.

        for word in felecterrors:
            errors = errors + masterdict.get(word, 0)
        for word in selecttruths:
            truths = truths + masterdict.get(word, 0)

        if truths > errors:
            LongSproblem = False
        else:
            LongSproblem = True

        if LongSproblem == False:
            corrected = correct_tokens
        else:
            longSfiles.append(filename)
            deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(
                correct_tokens, debug)

            ## Adjust wordcounts to reflect contextual spellchecking.

            if wordcountflag:

                for word, count in deleted.items():
                    masterdict = subtract_counts(word, masterdict, count)
                    totaldeleted = totaldeleted + count

                for word, count in added.items():
                    masterdict = add_counts(word, masterdict, count)
                    totaladded = totaladded + count

        # Write corrected file.

        outfilename = filename[:-suffixlen] + ".clean.txt"

        with open(outfilename, mode='w', encoding='utf-8') as file:
            lasttoken = ""
            for token in corrected:
                if lasttoken == '\n' and (token == '"' or token == "'"):
                    token = token
                elif token != '\n' and token != "“" and not (
                        token.startswith('<') and token.endswith('>')):
                    token = token + " "

                file.write(token)
                lasttoken = token

        print(outfilename)

        ## If we're also writing wordcount files, we need to write the .tsv file.

        if wordcountflag:

            outlist = sorted(masterdict.items(),
                             key=lambda x: x[1],
                             reverse=True)

            outfilename = outfilename[:-10] + ".vol.tsv"
            totalwordsinvol = 0

            with open(outfilename, mode='w', encoding='utf-8') as file:
                for item in outlist:
                    outline = item[0] + delim + str(item[1]) + '\n'
                    file.write(outline)
                    totalwordsinvol += item[1]

            print(outfilename)

            metatuple = (outfilename, str(totalwordsinvol), str(pre_matched),
                         str(pre_english), str(post_matched),
                         str(post_english), str(totaladded), str(totaldeleted))
            processedmeta.append(metatuple)

        count += 1
        if count > 200:
            break

    # END ITERATION ACROSS FILES.

    # Write the errorlog and list of long S files.
    errorpath = FileUtils.clearpath(rootpath, "processingerrors.txt")
    longSpath = FileUtils.clearpath(rootpath, "longSfiles.txt")
    metapath = FileUtils.clearpath(rootpath, "processing_metadata.tsv")

    if len(errorlog) > 0:
        with open(errorpath, mode='w', encoding='utf-8') as file:
            for line in errorlog:
                file.write(line + '\n')
        print("Writing", errorpath)

    if len(longSfiles) > 0:
        with open(longSpath, mode='w', encoding='utf-8') as file:
            for line in longSfiles:
                file.write(line + '\n')
        print("Writing", longSpath)

    if len(processedmeta) > 0:
        with open(metapath, mode='w', encoding='utf8') as file:
            file.write(
                'filename\twordinvol\toriginallyindict\toriginallyenglish\tindictpostcorrection\tenglishpostcorrection\taddedbycontextmodule\tdeletedbycontextmodule\n'
            )
            for atuple in processedmeta:
                outline = '\t'.join(atuple) + '\n'
                file.write(outline)
        print("Writing", metapath)

예제 #5

파일 보기

파일: Tokenize.py 프로젝트: Borel1/GenreProject

for line in filelines:
    line = line.rstrip()
    line = line.split(delim)
    if line[0] in HTIDs:
        HTIDs.discard(line[0])

processedmeta = list()
errorlog = list()
longSfiles = list()
totaladded = dict()
totaldeleted = dict()

for thisID in HTIDs:

    filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
    filename = filepath + postfix + '/' + postfix + ".clean.txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError as e:
        successflag = False

    if not successflag:
        print(thisID + " is missing.")
        errorlog.append(thisID + '\t' + "missing")
        continue

    tokens, aretokensverse, pagefeatures = Volume.as_stream(lines,

예제 #6

파일 보기

파일: Tokenize.py 프로젝트: Borel1/GenreProject

for line in filelines:
    line = line.rstrip()
    line = line.split(delim)
    if line[0] in HTIDs:
        HTIDs.discard(line[0])

processedmeta = list()
errorlog = list()
longSfiles = list()
totaladded = dict()
totaldeleted = dict()

for thisID in HTIDs:
    
    filepath, postfix = FileCabinet.pairtreepath(thisID, datapath)
    filename = filepath + postfix + '/' + postfix + ".clean.txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError as e:
        successflag = False

    if not successflag:
        print(thisID + " is missing.")
        errorlog.append(thisID + '\t' + "missing")
        continue
        
    tokens, aretokensverse, pagefeatures = Volume.as_stream(lines, verbose=debug)

예제 #7

파일 보기

파일: SliceIndexer.py 프로젝트: Anterotesis/DataMunging

with open(HTIDfile, encoding="utf-8") as file:
    HTIDlist = file.readlines()

Lexicon = Dictionary.BuildLexicon(dictionarypath, debug)

writename = slicename + "IND.txt"

delim = '\t'

BigIndex = dict()

SortedIndex = list()

for IDtoprocess in HTIDlist:
    IDtoprocess = IDtoprocess.strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError as e:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue
        
    tokens = TokenGen.keep_hyphens(lines,Lexicon,verbose=debug)

예제 #8

파일 보기

파일: OCRnormalizer.py 프로젝트: Anterotesis/DataMunging

def main():
    import FileCabinet
    import FileUtils
    import Volume2
    import Context
    import sys
    import os

    # DEFINE CONSTANTS.
    delim = '\t'
    debug = False
    felecterrors = ['fee', 'fea', 'fay', 'fays', 'fame', 'fell', 'funk', 'fold', 'haft', 'fat', 'fix', 'chafe', 'loft']
    selecttruths = ['see', 'sea', 'say', 'says', 'same', 'sell', 'sunk', 'sold', 'hast', 'sat', 'six', 'chase', 'lost']

    # Locate ourselves in the directory structure.

    cwd = os.getcwd()
    cwdparent = os.path.abspath(os.path.join(cwd, os.pardir))

    # We need to find a directory called 'rulesets,' which we expect to be located
    # either within the working directory or adjacent to it.

    if os.path.isdir(os.path.join(cwd, "rulesets")):
        rulepath = os.path.join(cwd, "rulesets")
    elif os.path.isdir(os.path.join(cwdparent, "rulesets")):
        rulepath = os.path.join(cwdparent, "rulesets")
    else:
        user = input("Please specify a path to the ruleset directory: ")
        if os.path.isdir(user):
            rulepath = user
        else:
            print("Invalid path.")
            sys.exit()

    # Use rulepath to load relevant rules inside modules.

    Volume2.importrules(rulepath)
    Context.importrules(rulepath)

    # Now we enter dialogue with the user. This is all a little 1982,
    # but what can I say? Wetware upgrades are expensive.

    def prompt(promptstring, options):
        user = input(promptstring)
        if user not in options:
            user = prompt(promptstring, options)
        return user

    # Ask the user to tell us how to find files to process.
    print("****************** CorrectOCR 0.1 ******************")
    print()
    print("Do you want the full spiel (explanations, caveats, etc.)")
    user = prompt("y/n : ", ["y", "n"])
    
    if user.lower() == "y":
        spielpath = os.path.join(cwd, "spiel.txt")
        with open(spielpath, encoding = 'utf-8') as file:
            filelines = file.readlines()
        for line in filelines:
            print(line, end='')

    print("\nThis script will correct .txt files, or extract text")
    print("from zipped archives containing one txt file for each page.")
    print("In either case it writes the cleaned files back to their")
    print("original locations with the new suffix '.clean.txt'.")
    print("\nDo you want to unpack .zip files or .txt files?")
    user = prompt("zip or txt: ", ["zip", "txt"])
    suffix = "." + user
    suffixlen = len(suffix)
    
    print("\nThere are two ways to identify the location of the")
    print("files to be corrected.")
    print("\n1. Provide the path to a folder that contains them. I'll")
    print("recursively search subdirectories of that folder as well. Or,")
    print("\n2. Provide a file holding a list of pairtree file identifiers,")
    print("e.g. HathiTrust Volume IDs. I can use those identifiers to infer")
    print("the paths to the files themselves.\n")

    user = prompt("Which option do you prefer (1 or 2)? ", ["1", "2"])
    
    if user == "1":
        rootpath = input("Path to the folder that contains source files: ")
        filelist = FileUtils.recursivefilegetter(rootpath, suffix)
 
    else:
        print("I expect the pairtree identifiers to be listed one per line,")
        print("and to be the only item on a line.")  
        filepath = input("Path to the file that contains pairtree identifiers: ")
        filelist = list()
        with open(filepath, encoding = 'utf-8') as file:
            filelines = file.readlines()

        print("Now I need a path to the folder that contains the pairtree structure.")
        print("If you have multiple folders for different libraries, this should be")
        print("the folder above them all. It should end with a slash.")
        rootpath = input("Path to the folder that contains pairtree: ")
        for line in filelines:
            line = line.rstrip()
            filepath, postfix = FileCabinet.pairtreepath(line, rootpath)
            filename = filepath + postfix + '/' + postfix + suffix
            filelist.append(filename)

    print("\nI identified", len(filelist), "files in that location.")

    print("\nI can just write clean text files (with suffix clean.txt)")
    print("or I can also write tab-separated files that count the words")
    print("in each file after correction.")
    user = prompt("1) Text only or 2) text-plus-wordcounts? (1 or 2): ", ["1", "2"])
    if user == "1":
        wordcountflag = False
    else:
        wordcountflag = True
    
    print("Now proceeding to process the files.\n")

    def subtract_counts (token, adict, tosubtract):
        '''Adjusts a dictionary by subtracting tosubtract instances of token.'''
        if token in adict:
            adict[token] = adict[token] - tosubtract
            if adict[token] < 0:
                del adict[token]
            elif adict[token] < 1:
                del adict[token]
        return adict

    def add_counts (token, adict, toadd):
        '''Adjusts a dictionary by adding toadd instances of token.'''
        if token in adict:
            adict[token] = adict[token] + toadd
        else:
            adict[token] = toadd
        return adict

    # Here's where we BEGIN THE ACTUAL CORRECTION OF FILES.
    
    processedmeta = list()
    errorlog = list()
    longSfiles = list()

    count = 0

    for filename in filelist:

        try:
            if suffix == ".zip":
                lines = FileUtils.readzip(filename)
                successflag = True
            else:
                with open(filename, encoding='utf-8') as file:
                    lines = file.readlines()
                    successflag = True
        except IOError as e:
            successflag = False

        if not successflag:
            print(filename + " is missing.")
            errorlog.append(filename + '\t' + "missing")
            continue
            
        tokens, pre_matched, pre_english = Volume2.as_stream(lines, verbose=debug)

        tokencount = len(tokens)
        
        if len(tokens) < 10:
            print(filename, "has only tokencount", len(tokens))
            errorlog.append(filename + '\t' + 'short')

        correct_tokens, pages, post_matched, post_english = Volume2.correct_stream(tokens, verbose = debug)

        # Combine page dictionaries into a master dictionary.
        # If you ask, why didn't you just produce one in the first place? ...
        # answer has to do with flexibility of the Volume module for other purposes.

        pagecounter = 0
        masterdict = dict()
        for page in pages:
            for item in page:
                if item in masterdict:
                    masterdict[item] += page[item]
                else:
                    masterdict[item] = page[item]

        # Now that we have a master dictionary, consider whether there are long-s problems.
        # This algorithm works adequately.

        errors = 1
        truths = 1

        totaladded = 0
        totaldeleted = 0
        
        # Initialized to 1 as a Laplacian correction.
        
        for word in felecterrors:
            errors = errors + masterdict.get(word, 0)
        for word in selecttruths:
            truths = truths + masterdict.get(word, 0)

        if truths > errors:
            LongSproblem = False
        else:
            LongSproblem = True

        if LongSproblem == False:
            corrected = correct_tokens
        else:
            longSfiles.append(filename)
            deleted, added, corrected, changedphrases, unchanged = Context.catch_ambiguities(correct_tokens, debug)

            ## Adjust wordcounts to reflect contextual spellchecking.

            if wordcountflag:
                
                for word, count in deleted.items():
                    masterdict = subtract_counts(word, masterdict, count)
                    totaldeleted = totaldeleted + count

                for word, count in added.items():
                    masterdict = add_counts(word, masterdict, count)
                    totaladded = totaladded + count

        # Write corrected file.
 
        outfilename = filename[:-suffixlen] + ".clean.txt"
        
        with open(outfilename, mode = 'w', encoding = 'utf-8') as file:
            lasttoken = ""
            for token in corrected:
                if lasttoken == '\n' and (token == '"' or token == "'"):
                    token = token
                elif token != '\n' and token != "“" and not (token.startswith('<') and token.endswith('>')):
                    token = token + " "
                    
                file.write(token)
                lasttoken = token

        print(outfilename)

        ## If we're also writing wordcount files, we need to write the .tsv file.

        if wordcountflag:
                        
            outlist = sorted(masterdict.items(), key = lambda x: x[1], reverse = True)
            
            outfilename = outfilename[ :-10] + ".vol.tsv"
            totalwordsinvol = 0
            
            with open(outfilename, mode = 'w', encoding = 'utf-8') as file:
                for item in outlist:
                    outline = item[0] + delim + str(item[1]) + '\n'
                    file.write(outline)
                    totalwordsinvol += item[1]

            print(outfilename)

            metatuple = (outfilename, str(totalwordsinvol), str(pre_matched), str(pre_english), str(post_matched),
                         str(post_english), str(totaladded), str(totaldeleted))
            processedmeta.append(metatuple)
            
        count += 1
        if count > 200:
            break
                
    # END ITERATION ACROSS FILES.
    
    # Write the errorlog and list of long S files.
    errorpath = FileUtils.clearpath(rootpath, "processingerrors.txt")
    longSpath = FileUtils.clearpath(rootpath, "longSfiles.txt")
    metapath = FileUtils.clearpath(rootpath, "processing_metadata.tsv")
    
    if len(errorlog) > 0:
        with open(errorpath, mode = 'w', encoding = 'utf-8') as file:
            for line in errorlog:
                file.write(line + '\n')
        print("Writing", errorpath)

    if len(longSfiles) > 0:
        with open(longSpath, mode = 'w', encoding = 'utf-8') as file:
            for line in longSfiles:
                file.write(line + '\n')
        print("Writing", longSpath)

    if len(processedmeta) > 0:
        with open(metapath, mode = 'w', encoding = 'utf8') as file:
            file.write('filename\twordinvol\toriginallyindict\toriginallyenglish\tindictpostcorrection\tenglishpostcorrection\taddedbycontextmodule\tdeletedbycontextmodule\n')
            for atuple in processedmeta:
                outline = '\t'.join(atuple) + '\n'
                file.write(outline)
        print("Writing", metapath)