def make_train_csv_dict(train_csv):
    """Index the training records by author id.

    Args:
        train_csv: iterable of records that can be subscripted with
            ``'author_id'``.

    Returns:
        A dict mapping each record's ``'author_id'`` to the record itself.
        If an id occurs more than once, the last record seen is kept.
    """
    return {record['author_id']: record for record in train_csv}

def make_clean_paper_author(confirmed_pairs, paper_author_pairs, output_file_path):
    """Write the selected paper-author records to a comma-separated text file.

    Args:
        confirmed_pairs: iterable of indexes (or keys) into
            ``paper_author_pairs``; one row is written per entry, in
            iteration order.
        paper_author_pairs: subscriptable collection of records, each
            providing the keys ``'paper_id'``, ``'author_id'``,
            ``'author_name'`` and ``'author_affiliation'``.
        output_file_path: path of the file to write; it is created, or
            truncated if it already exists.

    The first line written is the header ``PaperId,AuthorId,Name,Affiliation``.
    The two ids are formatted with ``%d``, the name and affiliation with ``%s``.
    """
    # f.write() is used instead of the ``print >> f`` statement: the chevron
    # form only works on Python 2 (on Python 3 it raises TypeError at run
    # time), whereas write() emits the same text on both.
    # NOTE(review): fields are joined with bare commas and never quoted, so a
    # name or affiliation containing ',' produces a malformed row -- confirm
    # whether the consumer of this file expects csv-style quoting.
    with open(output_file_path, 'w+') as f:
        f.write('PaperId,AuthorId,Name,Affiliation\n')
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            f.write('%d,%d,%s,%s\n' % (pair['paper_id'],
                                       pair['author_id'],
                                       pair['author_name'],
                                       pair['author_affiliation']))

if __name__ == '__main__':
    # Script entry point. Loads the inputs named on the command line, derives
    # the confirmed pairs with get_confirmed_pairs(), passes them to
    # filter_paper_author_csv() to obtain row indexes into the PaperAuthor
    # data, and writes those rows with make_clean_paper_author().
    #
    # Output uses sys.stderr.write() / single-argument print(...) rather than
    # the Python 2-only print statement forms; both produce the same text on
    # Python 2 and are also valid on Python 3.

    # Exactly five arguments are required after the script name.
    if len(sys.argv) != 1 + 5:
        sys.stderr.write(
            "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file\n"
            % (sys.argv[0], ))
        sys.exit(-1)

    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    print('Data loaded!')
    confirmed_pairs = get_confirmed_pairs(get_author_in_train(train_csv),
                                          train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    # sys.argv[5] is the output file path.
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)

Example #2
0
def get_internal_author_ids(internal_train_csv):
    """Collect the author ids found in *internal_train_csv*.

    Args:
        internal_train_csv: iterable of records that can be subscripted with
            ``'author_id'``.

    Returns:
        A dict whose keys are the ``'author_id'`` values of the records, each
        mapped to the constant ``1``.
    """
    return dict.fromkeys((row['author_id'] for row in internal_train_csv), 1)


if __name__ == '__main__':
    # Script entry point. Requires exactly three command-line arguments
    # (feature_file, internal_train_output, internal_valid_output); only
    # sys.argv[1] is read in the part of the script visible here.
    if len(sys.argv) != 1 + 3:
        print >> sys.stderr, "Usage : %s feature_file internal_train_output internal_valid_output" % (sys.argv[0],)
        sys.exit(-1)

    # The data sets are loaded from paths looked up in DATA, not from argv.
    train_csv = loader.load_train_csv(DATA['Train'])
    internal_train_csv = loader.load_train_csv(DATA['internal_train'])
    internal_valid_csv = loader.load_test_csv(DATA['internal_valid'])


    # Keyed by author_id; each value is a per-author dict keyed by paper_id
    # (the inner dicts are created in the loop below).
    author_papers = dict()

    with open(sys.argv[1], 'r') as f:
        # Read the whole feature file at once and split it into lines.
        lines = f.read().split('\n')
        pair_count = 0
        for rec in train_csv:
            author_id = rec['author_id']
            # Confirmed paper ids concatenated with the deleted ones, in that order.
            paper_ids = rec['confirmed_paper_ids'] + rec['deleted_paper_ids']
            paper_id_count = len(paper_ids)
            author_papers[author_id] = dict()
            
            # NOTE(review): the listing is cut off right after the `if` below;
            # its body is not visible here.
            for idx, paper_id in enumerate(paper_ids):
                if paper_id not in author_papers[author_id]:
Example #3
0
def get_internal_author_ids(internal_train_csv):
    """Build a lookup of the author ids present in *internal_train_csv*.

    Args:
        internal_train_csv: iterable of records that can be subscripted with
            ``'author_id'``.

    Returns:
        A dict mapping every ``'author_id'`` value seen to the constant ``1``.
    """
    return {entry['author_id']: 1 for entry in internal_train_csv}


if __name__ == '__main__':
    # Script entry point. Requires exactly three command-line arguments
    # (feature_file, internal_train_output, internal_valid_output); only
    # sys.argv[1] is read in the part of the script visible here.
    if len(sys.argv) != 1 + 3:
        print >> sys.stderr, "Usage : %s feature_file internal_train_output internal_valid_output" % (
            sys.argv[0], )
        sys.exit(-1)

    # The data sets are loaded from paths looked up in DATA, not from argv.
    train_csv = loader.load_train_csv(DATA['Train'])
    internal_train_csv = loader.load_train_csv(DATA['internal_train'])
    internal_valid_csv = loader.load_test_csv(DATA['internal_valid'])

    # Keyed by author_id; each value is a per-author dict keyed by paper_id.
    author_papers = dict()

    with open(sys.argv[1], 'r') as f:
        # Read the whole feature file at once and split it into lines.
        lines = f.read().split('\n')
        pair_count = 0
        for rec in train_csv:
            author_id = rec['author_id']
            # Confirmed paper ids concatenated with the deleted ones, in that order.
            paper_ids = rec['confirmed_paper_ids'] + rec['deleted_paper_ids']
            paper_id_count = len(paper_ids)
            author_papers[author_id] = dict()

            for idx, paper_id in enumerate(paper_ids):
                # Only the first occurrence of a paper id per author is stored.
                if paper_id not in author_papers[author_id]:
                    # The stored value is an entry of `lines` at an offset from
                    # pair_count. NOTE(review): the listing is cut off in the
                    # middle of this statement; the rest is not visible here.
                    author_papers[author_id][paper_id] = lines[pair_count +
def make_clean_paper_author(confirmed_pairs, paper_author_pairs,
                            output_file_path):
    """Write the selected paper-author records to a comma-separated text file.

    Args:
        confirmed_pairs: iterable of indexes (or keys) into
            ``paper_author_pairs``; one row is written per entry, in
            iteration order.
        paper_author_pairs: subscriptable collection of records, each
            providing the keys ``'paper_id'``, ``'author_id'``,
            ``'author_name'`` and ``'author_affiliation'``.
        output_file_path: path of the file to write; it is created, or
            truncated if it already exists.

    The first line written is the header ``PaperId,AuthorId,Name,Affiliation``.
    The two ids are formatted with ``%d``, the name and affiliation with ``%s``.
    """
    # f.write() is used instead of the ``print >> f`` statement: the chevron
    # form only works on Python 2 (on Python 3 it raises TypeError at run
    # time), whereas write() emits the same text on both.
    # NOTE(review): fields are joined with bare commas and never quoted, so a
    # name or affiliation containing ',' produces a malformed row -- confirm
    # whether the consumer of this file expects csv-style quoting.
    with open(output_file_path, 'w+') as f:
        f.write('PaperId,AuthorId,Name,Affiliation\n')
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            f.write('%d,%d,%s,%s\n' % (
                pair['paper_id'],
                pair['author_id'],
                pair['author_name'],
                pair['author_affiliation']))


if __name__ == '__main__':
    # Script entry point. Loads the inputs named on the command line, derives
    # the confirmed pairs with get_confirmed_pairs(), passes them to
    # filter_paper_author_csv() to obtain row indexes into the PaperAuthor
    # data, and writes those rows with make_clean_paper_author().
    #
    # Output uses sys.stderr.write() / single-argument print(...) rather than
    # the Python 2-only print statement forms; both produce the same text on
    # Python 2 and are also valid on Python 3.

    # Exactly five arguments are required after the script name.
    if len(sys.argv) != 1 + 5:
        sys.stderr.write(
            "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file\n"
            % (sys.argv[0], ))
        sys.exit(-1)

    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    print('Data loaded!')
    confirmed_pairs = get_confirmed_pairs(get_author_in_train(train_csv),
                                          train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    # sys.argv[5] is the output file path.
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)