# cleanse.py: Command line utility to read in all the messy names from
# the competition and write out a file that makes it easy to assess
# the cleansing function.
import loader
from collections import Counter

if __name__ == '__main__':
    # print(...) with a single string argument behaves identically on
    # Python 2 and 3, so the script runs under either interpreter.
    print('loading data')
    paths = loader.load_paths_file('data/paths.txt')
    graphs = loader.load_all_train_files()

    print('convert to lists from tuples for assignment')
    # list() copies are clearer and faster than identity comprehensions.
    # paths: list of node sequences; graphs: list of edge lists —
    # presumably tuples from the loader; converted so items are assignable.
    paths = [list(path) for path in paths]
    graphs = [[list(edge) for edge in graph] for graph in graphs]

    print('getting all names')
    # names maps node name -> occurrence count (dict from loader).
    names = loader.get_all_nodes_dict(graphs, paths)

    print('splitting off text names for corrector')
    # Only non-numeric names need spelling correction; purely numeric
    # node ids are left out of the corrector's vocabulary.
    text_names = {name: count for name, count in names.items()
                  if not name.isdigit()}

    print('making word lookup')
    word_lookup = loader.make_word_lookup(text_names)

    print('getting name corrections')
    # name_found: corrected-name mapping; edits: the applied corrections
    # — NOTE(review): exact semantics live in loader.get_name_found.
    name_found, edits = loader.get_name_found(text_names, word_lookup)
import numpy as np
import loader
from collections import defaultdict

# Configuration: locations and ranges of the competition data files.
train_file_format = 'train/train%d.txt'
train_file_low = 1
train_file_high = 15
test_times = 5
paths_file = 'paths.txt'
submission_file = 'submission.csv'

if __name__ == '__main__':
    # print(...) with a single string argument behaves identically on
    # Python 2 and 3, so the script runs under either interpreter.
    print('loading data')
    paths = loader.load_paths_file(paths_file)
    graphs = loader.load_train_files(train_file_format, train_file_low,
                                     train_file_high)

    # Prediction matrix: one row per path, one column per test time.
    m = len(paths)
    n = test_times
    # Random baseline predictions; to be replaced with real model scores.
    pred = np.random.rand(m, n)

    print('training node name decoder model')
    print('get all nodes')
    # names maps node name -> occurrence count (dict from loader).
    names = loader.get_all_nodes_dict(graphs, paths)

    print('make word lookup')
    word_lookup = loader.make_word_lookup(names)

    print('get name found')
    # name_found: corrected-name mapping; edits: the applied corrections
    # — NOTE(review): exact semantics live in loader.get_name_found.
    name_found, edits = loader.get_name_found(names, word_lookup)