# clean_edits.py
#
# Reads in a shorter names file and just tries to clean that.
#

from collections import defaultdict

import numpy as np

import loader

if __name__ == '__main__':
    # One name per line; surrounding whitespace (including the newline)
    # is stripped.
    with open('names_short.txt') as infile:
        names = [line.strip() for line in infile]

    ninstances = len(names)
    print('read %d instances' % ninstances)

    word_lookup = loader.make_word_lookup(names)
    print(word_lookup)

    # Rebuild every name word by word.  Each word is replaced by element
    # [0][0] of its lookup entry, keyed by loader.sort_word(word).
    # NOTE(review): presumably [0][0] is the preferred spelling of the
    # word -- confirm against loader.make_word_lookup.
    correct = []
    for name in names:
        fixed_words = [
            word_lookup[loader.sort_word(word)][0][0]
            for word in name.split()
        ]
        correct.append(' '.join(fixed_words))

    # Write "<original>\t<corrected>" pairs, one per line, in input order.
    with open('corrected.txt', 'w') as outfile:
        for original, fixed in zip(names, correct):
            outfile.write('%s\t%s\n' % (original, fixed))

    # for each name in order of descending frequency
# Copy every path and every edge into a fresh list so that individual
# entries can be reassigned in place below (tuples would not allow that).
print('convert to lists from tuples for assignment')
paths = [list(path) for path in paths]
graphs = [[list(edge) for edge in graph] for graph in graphs]

print('getting all names')
names = loader.get_all_nodes_dict(graphs, paths)

# Only non-numeric names are handed to the corrector; purely numeric
# names are left out of the lookup.
print('splitting off text names for corrector')
text_names = {
    name: count for name, count in names.items() if not name.isdigit()
}

print('making word lookup')
word_lookup = loader.make_word_lookup(text_names)

print('getting name corrections')
name_found, edits = loader.get_name_found(text_names, word_lookup)

# Replace names in the first two slots of every edge.
# NOTE(review): presumably those two slots are the edge's endpoint
# names -- confirm against the code that builds `graphs`.
print('correcting graph names')
graph_corrections = 0
for graph in graphs:
    for edge in graph:
        for k in range(2):
            name = edge[k]
            if not name.isdigit() and name in name_found:
                edge[k] = name_found[name]
                graph_corrections += 1
print('%d corrections made' % graph_corrections)