Exemple #1
0
def fuzzy_restrictions(outfile,
                       metadata,
                       corrections,
                       data_path,
                       threshold=95):
    """Write one CSV row per TSV file whose title has not been seen before.

    Every file yielded by ``yule_k.retrieve_tsvs(data_path)`` is mapped to an
    ``htid`` (its base name without the trailing ``.tsv``) and looked up in
    the metadata table.  A file is skipped when its title equals, or
    fuzzy-matches (``fuzz.token_set_ratio(...) >= threshold``), the title of
    a file that was already written.  For every remaining file the row
    ``htid, title, author, date, k`` is written, where ``k`` is the result of
    ``yule_k.calculate_k`` over the values of the file's table.

    Args:
        outfile: Path of the CSV file to create (truncated if it exists).
        metadata: Passed unchanged to ``yule_k.create_metadata_lookup``.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
        threshold: Minimum ``token_set_ratio`` score at which two titles are
            treated as the same book.
    """
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, 'wb') as csv_file:
        writer = csv.writer(csv_file,
                            delimiter=',',
                            quotechar='\'',
                            quoting=csv.QUOTE_MINIMAL)
        lookup = yule_k.create_metadata_lookup(metadata)
        correction_lookup = yule_k.create_correction_lookup(corrections)

        # Titles of the books that have actually been written out.
        seen_titles = set()

        for filename in yule_k.retrieve_tsvs(data_path):
            # Strip only the trailing extension; an unanchored pattern would
            # also eat a ".tsv" that happens to occur inside the id itself.
            htid = re.sub(r'\.tsv$', '', os.path.basename(filename))
            # NOTE(review): an htid missing from the metadata lookup is not
            # handled here, exactly as before.
            record = lookup[htid]
            title = record['title']

            if title in seen_titles:
                continue

            print(filename)

            # any() stops at the first near-duplicate instead of scoring the
            # title against every book seen so far.
            if any(fuzz.token_set_ratio(seen, title) >= threshold
                   for seen in seen_titles):
                continue

            seen_titles.add(title)
            tsv_table = yule_k.read_tsv_file(filename, correction_lookup)
            k = yule_k.calculate_k(list(tsv_table.values()))
            writer.writerow(
                [htid, title, record['author'], record['date'], k])
Exemple #2
0
def as_single_corpus(metadata, corrections, data_path):
    """Print the ``yule_k.calculate_k`` result for all TSV files pooled together.

    The per-item values of every file yielded by
    ``yule_k.retrieve_tsvs(data_path)`` are summed into a single table; the
    score computed over that combined table's values is printed to stdout.
    Each file name is printed as it is processed.  Nothing is returned.

    Args:
        metadata: Passed to ``yule_k.create_metadata_lookup``; the resulting
            lookup is not used by this function.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
    """
    # NOTE(review): the metadata lookup is never consulted below. The call is
    # kept in case callers rely on it running (e.g. failing early on a bad
    # `metadata` argument) -- confirm and drop it if not.
    yule_k.create_metadata_lookup(metadata)
    correction_lookup = yule_k.create_correction_lookup(corrections)

    corpus_table = defaultdict(int)
    for filename in yule_k.retrieve_tsvs(data_path):
        print(filename)
        file_table = yule_k.read_tsv_file(filename, correction_lookup)
        for item, val in file_table.items():
            corpus_table[item] += val

    print(yule_k.calculate_k(list(corpus_table.values())))
Exemple #3
0
def non_fuzzy(outfile, metadata, corrections, data_path):
    """Write one CSV row for every TSV file, with no title de-duplication.

    Every file yielded by ``yule_k.retrieve_tsvs(data_path)`` is mapped to an
    ``htid`` (its base name without the trailing ``.tsv``), looked up in the
    metadata table, and written as the row ``htid, title, author, date, k``,
    where ``k`` is the result of ``yule_k.calculate_k`` over the values of
    the file's table.  Each file name is printed as it is processed.

    Args:
        outfile: Path of the CSV file to create (truncated if it exists).
        metadata: Passed unchanged to ``yule_k.create_metadata_lookup``.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
    """
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, 'wb') as csv_file:
        writer = csv.writer(csv_file,
                            delimiter=',',
                            quotechar='\'',
                            quoting=csv.QUOTE_MINIMAL)
        lookup = yule_k.create_metadata_lookup(metadata)
        correction_lookup = yule_k.create_correction_lookup(corrections)

        for filename in yule_k.retrieve_tsvs(data_path):
            print(filename)
            # Strip only the trailing extension; an unanchored pattern would
            # also eat a ".tsv" that happens to occur inside the id itself.
            htid = re.sub(r'\.tsv$', '', os.path.basename(filename))
            tsv_table = yule_k.read_tsv_file(filename, correction_lookup)
            k = yule_k.calculate_k(list(tsv_table.values()))
            record = lookup[htid]
            writer.writerow(
                [htid, record['title'], record['author'], record['date'], k])
Exemple #4
0
def as_single_corpus(metadata, corrections, data_path):
    """Treat every TSV file as one corpus and print its combined score.

    Item values from each file returned by
    ``yule_k.retrieve_tsvs(data_path)`` are accumulated into one table, and
    the value of ``yule_k.calculate_k`` over that table's values is printed
    to stdout.  File names are printed while they are read.  The function
    returns nothing.

    Args:
        metadata: Passed to ``yule_k.create_metadata_lookup``; the resulting
            lookup is not used by this function.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
    """
    # NOTE(review): the metadata lookup is never consulted below. The call is
    # kept in case callers rely on it running (e.g. failing early on a bad
    # `metadata` argument) -- confirm and drop it if not.
    yule_k.create_metadata_lookup(metadata)
    correction_lookup = yule_k.create_correction_lookup(corrections)

    corpus_table = defaultdict(int)
    for filename in yule_k.retrieve_tsvs(data_path):
        print(filename)
        file_table = yule_k.read_tsv_file(filename, correction_lookup)
        for item, val in file_table.items():
            corpus_table[item] += val

    print(yule_k.calculate_k(list(corpus_table.values())))
Exemple #5
0
def non_fuzzy(outfile, metadata, corrections, data_path):
    """Score every TSV file individually and dump the results to a CSV file.

    For each file from ``yule_k.retrieve_tsvs(data_path)`` the ``htid`` is
    taken from the base name (trailing ``.tsv`` removed), the matching
    metadata record is fetched, and the row ``htid, title, author, date, k``
    is written; ``k`` is ``yule_k.calculate_k`` applied to the values of the
    file's table.  No duplicate-title filtering is done.  Each file name is
    printed as it is processed.

    Args:
        outfile: Path of the CSV file to create (truncated if it exists).
        metadata: Passed unchanged to ``yule_k.create_metadata_lookup``.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
    """
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, 'wb') as csv_file:
        writer = csv.writer(csv_file,
                            delimiter=',',
                            quotechar='\'',
                            quoting=csv.QUOTE_MINIMAL)
        lookup = yule_k.create_metadata_lookup(metadata)
        correction_lookup = yule_k.create_correction_lookup(corrections)

        for filename in yule_k.retrieve_tsvs(data_path):
            print(filename)
            # Anchor the pattern so only the extension is removed, never a
            # ".tsv" occurring earlier in the name.
            htid = re.sub(r'\.tsv$', '', os.path.basename(filename))
            tsv_table = yule_k.read_tsv_file(filename, correction_lookup)
            k = yule_k.calculate_k(list(tsv_table.values()))
            record = lookup[htid]
            writer.writerow(
                [htid, record['title'], record['author'], record['date'], k])
Exemple #6
0
def fuzzy_restrictions(outfile, metadata, corrections, data_path, threshold=95):
    """Score TSV files into a CSV file, skipping exact and near-duplicate titles.

    For each file from ``yule_k.retrieve_tsvs(data_path)`` the ``htid`` is
    taken from the base name (trailing ``.tsv`` removed) and its metadata
    record is fetched.  The file is ignored if its title is identical to, or
    scores ``fuzz.token_set_ratio(...) >= threshold`` against, the title of
    any file already written.  Otherwise the row
    ``htid, title, author, date, k`` is written, ``k`` being
    ``yule_k.calculate_k`` applied to the values of the file's table.

    Args:
        outfile: Path of the CSV file to create (truncated if it exists).
        metadata: Passed unchanged to ``yule_k.create_metadata_lookup``.
        corrections: Passed unchanged to ``yule_k.create_correction_lookup``.
        data_path: Passed unchanged to ``yule_k.retrieve_tsvs``.
        threshold: Minimum ``token_set_ratio`` score at which two titles are
            treated as the same book.
    """
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(outfile, 'wb') as csv_file:
        writer = csv.writer(csv_file,
                            delimiter=',',
                            quotechar='\'',
                            quoting=csv.QUOTE_MINIMAL)
        lookup = yule_k.create_metadata_lookup(metadata)
        correction_lookup = yule_k.create_correction_lookup(corrections)

        # Titles of the books that have actually been written out.
        seen_titles = set()

        for filename in yule_k.retrieve_tsvs(data_path):
            # Anchor the pattern so only the extension is removed, never a
            # ".tsv" occurring earlier in the name.
            htid = re.sub(r'\.tsv$', '', os.path.basename(filename))
            # NOTE(review): an htid missing from the metadata lookup is not
            # handled here, exactly as before.
            record = lookup[htid]
            title = record['title']

            if title in seen_titles:
                continue

            print(filename)

            # Short-circuit on the first near-duplicate rather than building
            # the full list of matches just to test whether it is empty.
            if any(fuzz.token_set_ratio(seen, title) >= threshold
                   for seen in seen_titles):
                continue

            seen_titles.add(title)
            tsv_table = yule_k.read_tsv_file(filename, correction_lookup)
            k = yule_k.calculate_k(list(tsv_table.values()))
            writer.writerow(
                [htid, title, record['author'], record['date'], k])