Example #1
# cleanse.py: Command line utility to read in all the messy names from
# the competition and write out a file that makes it easy to assess
# the cleansing function.

import loader
from collections import Counter

if __name__ == '__main__':
    print 'loading data'
    paths = loader.load_paths_file('data/paths.txt')
    graphs = loader.load_all_train_files()

    print 'convert to lists from tuples for assignment'
    paths = [list(path) for path in paths]
    graphs = [[list(edge) for edge in graph] for graph in graphs]
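    # (The conversion matters because tuples are immutable: assigning a
    #  corrected name back, e.g. edge[0] = 'fixed', raises a TypeError on a
    #  tuple but works on the list copies made above.)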
    
    print 'getting all names'
    names = loader.get_all_nodes_dict(graphs, paths)

    print 'splitting off text names for corrector'
    # Keep only names that contain text; purely numeric node ids are not
    # passed to the corrector.
    text_names = {name: count for name, count in names.items()
                  if not name.isdigit()}

    print 'making word lookup'
    word_lookup = loader.make_word_lookup(text_names)
    
    print 'getting name corrections'
    name_found, edits = loader.get_name_found(text_names, word_lookup)
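
    # NOTE: hypothetical continuation, not part of the original excerpt.
    # The header comment says this utility writes out a file for assessing
    # the cleansing function; a minimal sketch of that step, assuming
    # name_found maps each original name to its corrected form (the real
    # structure returned by loader.get_name_found may differ) and using an
    # illustrative output path:
    with open('data/cleansed_names.txt', 'w') as f:
        for name in sorted(name_found):
            f.write('%s\t%s\n' % (name, name_found[name]))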

Example #2
import numpy as np
import loader
from collections import defaultdict

train_file_format = 'train/train%d.txt'
train_file_low = 1
train_file_high = 15
test_times = 5
paths_file = 'paths.txt'
submission_file = 'submission.csv'

if __name__ == '__main__':

    print 'loading data'
    paths = loader.load_paths_file(paths_file)
    graphs = loader.load_train_files(train_file_format,
                                     train_file_low, train_file_high)
    m = len(paths)
    n = test_times
    # Placeholder predictions: uniform random values in [0, 1); the
    # commented-out line below is an all-zeros alternative.
    # pred = np.zeros((m, n))
    pred = np.random.rand(m, n)

    print 'training node name decoder model'
    print 'get all nodes'
    names = loader.get_all_nodes_dict(graphs, paths)
    print 'make word lookup'
    word_lookup = loader.make_word_lookup(names)
    print 'get name found'
    name_found, edits = loader.get_name_found(names, word_lookup)
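
    # NOTE: hypothetical continuation, not part of the original excerpt.
    # The script defines submission_file and a prediction matrix pred of
    # shape (len(paths), test_times) but the excerpt stops before writing it
    # out.  A minimal sketch of a flat CSV dump; the actual submission
    # format expected by the competition is an assumption here:
    np.savetxt(submission_file, pred, delimiter=',', fmt='%.6f')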