def make_clean_paper_author(confirmed_pairs, paper_author_pairs, output_file_path):
    """Write the selected paper-author rows to a CSV file.

    The file starts with the header ``PaperId,AuthorId,Name,Affiliation``
    followed by one row per entry of ``confirmed_pairs``, in iteration order.

    Args:
        confirmed_pairs: Iterable of indices into ``paper_author_pairs``.
        paper_author_pairs: Indexable collection of mappings; each selected
            entry must provide the keys ``paper_id``, ``author_id``,
            ``author_name`` and ``author_affiliation``.
        output_file_path: Path of the output file. It is opened in ``'w+'``
            mode, so any existing content is discarded.
    """
    # Local import: the module's import header is not touched by this block.
    import csv

    with open(output_file_path, 'w+') as f:
        # csv.writer quotes names/affiliations that contain commas or quotes,
        # so every row keeps exactly four columns. Fields without special
        # characters are written exactly as before; '\n' keeps the previous
        # line terminator instead of the csv default '\r\n'.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['PaperId', 'AuthorId', 'Name', 'Affiliation'])
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            writer.writerow([
                # Keep the original %d / %s rendering of every field.
                '%d' % pair['paper_id'],
                '%d' % pair['author_id'],
                '%s' % pair['author_name'],
                '%s' % pair['author_affiliation'],
            ])


if __name__ == '__main__':
    # Expected arguments: Train.csv PaperAuthor.csv Test.csv label output_file
    if len(sys.argv) != 1 + 5:
        sys.stderr.write(
            "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file\n"
            % (sys.argv[0], ))
        sys.exit(-1)

    # Load every input up front.
    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    print('Data loaded!')

    # NOTE(review): get_confirmed_pairs / filter_paper_author_csv are defined
    # elsewhere; the result of the latter is used below as row indices into
    # paper_author_csv.
    confirmed_pairs = get_confirmed_pairs(
        get_author_in_train(train_csv), train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)
def make_train_csv_dict(train_csv):
    """Index the training records by author id.

    Args:
        train_csv: Iterable of mappings that each contain an ``author_id``
            key.

    Returns:
        A dict mapping each ``author_id`` to its record (the same object, not
        a copy). If an id occurs more than once, the last record wins.
    """
    return dict((author['author_id'], author) for author in train_csv)


def make_clean_paper_author(confirmed_pairs, paper_author_pairs, output_file_path):
    """Write the selected paper-author rows to a CSV file.

    The file starts with the header ``PaperId,AuthorId,Name,Affiliation``
    followed by one row per entry of ``confirmed_pairs``, in iteration order.

    Args:
        confirmed_pairs: Iterable of indices into ``paper_author_pairs``.
        paper_author_pairs: Indexable collection of mappings; each selected
            entry must provide the keys ``paper_id``, ``author_id``,
            ``author_name`` and ``author_affiliation``.
        output_file_path: Path of the output file. It is opened in ``'w+'``
            mode, so any existing content is discarded.
    """
    # Local import: the module's import header is not touched by this block.
    import csv

    with open(output_file_path, 'w+') as f:
        # csv.writer quotes names/affiliations that contain commas or quotes,
        # so every row keeps exactly four columns. Fields without special
        # characters are written exactly as before; '\n' keeps the previous
        # line terminator instead of the csv default '\r\n'.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['PaperId', 'AuthorId', 'Name', 'Affiliation'])
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            writer.writerow([
                # Keep the original %d / %s rendering of every field.
                '%d' % pair['paper_id'],
                '%d' % pair['author_id'],
                '%s' % pair['author_name'],
                '%s' % pair['author_affiliation'],
            ])


if __name__ == '__main__':
    # Expected arguments: Train.csv PaperAuthor.csv Test.csv label output_file
    if len(sys.argv) != 1 + 5:
        sys.stderr.write(
            "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file\n"
            % (sys.argv[0], ))
        sys.exit(-1)

    # Load every input up front.
    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    print('Data loaded!')

    # NOTE(review): get_confirmed_pairs / filter_paper_author_csv are defined
    # elsewhere; the result of the latter is used below as row indices into
    # paper_author_csv.
    confirmed_pairs = get_confirmed_pairs(
        get_author_in_train(train_csv), train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)
return authors def make_test_file(authors, test_file_path): with open(test_file_path, 'w+') as f: f.write("AuthorId,PaperIds\n") for author_id in authors: f.write(str(author_id) + ',') f.write(' '.join([str(x) for x in authors[author_id]])) f.write('\n') if __name__ == '__main__': if len(sys.argv) != 1 + 4: print >> sys.stderr, "Usage : %s Author.csv Paper.csv PaperAuthor.csv OutputFile" % (sys.argv[0],) sys.exit(-1) authors = loader.load_author_csv(sys.argv[1]) papers = loader.load_paper_csv(sys.argv[2]) paper_author_pairs = loader.load_paper_author_csv(sys.argv[3]) print 'Data loaded' print 'Generating' test_pairs = generate_pairs(authors, papers, paper_author_pairs) print 'Saving' make_test_file(test_pairs, sys.argv[4])