Ejemplo n.º 1
0
# clean_edits.py
#
# Reads in a shorter names file and just tries to clean that.
#

from collections import defaultdict
import numpy as np
import loader

if __name__ == '__main__':
    # Script entry point: read the short names file, rebuild every name
    # word-by-word from the word lookup, and write original/rewritten
    # pairs to a tab-separated file.

    # One name per line; surrounding whitespace (including the newline)
    # is stripped.  Iterating the file object avoids the intermediate
    # list that readlines() builds.
    with open('names_short.txt') as infile:
        names = [line.strip() for line in infile]
    ninstances = len(names)
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3, so the script no longer fails to parse on 3.
    print('read %d instances' % ninstances)

    word_lookup = loader.make_word_lookup(names)
    print(word_lookup)

    correct = []
    for name in names:
        # Each word is keyed by loader.sort_word(word); the replacement is
        # element [0][0] of the lookup entry (presumably the preferred
        # spelling for that key -- confirm against loader).
        fixed_words = [word_lookup[loader.sort_word(word)][0][0]
                       for word in name.split()]
        correct.append(' '.join(fixed_words))

    # Emit "<original>\t<rewritten>" per input line, in input order.
    with open('corrected.txt', 'w') as outfile:
        for original, fixed in zip(names, correct):
            outfile.write('%s\t%s\n' % (original, fixed))

    # for each name in order of descending frequency
    
Ejemplo n.º 2
0
    # Rebuild every path and every graph edge as a fresh list so that
    # individual elements can be reassigned in place further down (per the
    # log message, the incoming containers are tuples).
    print 'convert to lists from tuples for assignment'
    paths = [[x for x in path] for path in paths]
    graphs = [[[x for x in edge] for edge in graph] for graph in graphs]
    
    # Gather the node names from the graphs and paths.  The result is
    # consumed below via .items() as (name, count) pairs -- presumably
    # occurrence counts; confirm against loader.get_all_nodes_dict.
    print 'getting all names'
    names = loader.get_all_nodes_dict(graphs, paths)

    # Keep only names that are not made up entirely of digits; only these
    # are passed on to the word lookup and the corrector.
    print 'splitting off text names for corrector'
    text_names = {}
    for name, count in names.items():
        if not name.isdigit():
            text_names[name] = count

    print 'making word lookup'
    word_lookup = loader.make_word_lookup(text_names)
    
    # name_found is used below as a mapping from an original name to its
    # replacement.  `edits` is not used in the lines visible here.
    print 'getting name corrections'
    name_found, edits = loader.get_name_found(text_names, word_lookup)

    # Walk every edge of every graph and overwrite, in place, whichever of
    # its first two elements is a non-digit name with an entry in
    # name_found.  Every such assignment is counted, whether or not the
    # replacement differs from the original name.
    print 'correcting graph names'
    graph_corrections = 0
    for i in range(len(graphs)):
        for j in range(len(graphs[i])):
            for k in range(2):
                name = graphs[i][j][k]
                if not name.isdigit() and name in name_found:
                    graphs[i][j][k] = name_found[name]
                    graph_corrections += 1
    print '%d corrections made' % graph_corrections