def get_npapers():
    """Write the "number of papers per author" feature files.

    Training pairs go to ``npapers.train.dat`` as ``aid p1 npapers tf`` where
    ``npapers`` is ``len(papers[aid])`` and ``tf`` is ``'T'`` when ``p1`` is
    listed in ``confirmed[aid]`` and ``'F'`` otherwise.  Validation pairs go
    to ``npapers.valid.dat`` as ``aid p1 npapers``.

    Progress messages are written to stderr.  Returns nothing.
    """
    print >> sys.stderr, 'Reading data...'
    # Only the first mapping is used; it is indexed by author id below.
    # NOTE(review): presumably author id -> that author's papers; confirm
    # against get_author_papers.
    papers, _ = get_author_papers()

    print >> sys.stderr, 'Calculating scores for train set...'
    confirmed, deleted = get_train()
    # ``with`` guarantees the file is closed even if a lookup below raises.
    with open('npapers.train.dat', 'w') as outf:
        for aid in confirmed:
            # Build the membership set once per author instead of scanning
            # the confirmed list for every paper.
            confirmed_set = set(confirmed[aid])
            for p1 in confirmed[aid] + deleted[aid]:
                tf = 'T' if p1 in confirmed_set else 'F'
                print >> outf, aid, p1, len(papers[aid]), tf

    print >> sys.stderr, 'Calculating scores for validation set...'
    validation = get_valid()
    with open('npapers.valid.dat', 'w') as outf:
        for aid in validation:
            for p1 in validation[aid]:
                print >> outf, aid, p1, len(papers[aid])
def _count_collaborations(authors):
    """Count, for every pair of authors, how many author lists contain both.

    ``authors`` is a mapping whose values are author-id sequences.  The
    result is keyed by ``(smaller_id, larger_id)`` so each unordered pair has
    exactly one entry.
    """
    num_collaborations = {}
    for coauthors in authors.values():
        for i, first in enumerate(coauthors):
            for second in coauthors[i + 1:]:
                # Always use the lowest author id as the first key.
                if first < second:
                    pair = (first, second)
                else:
                    pair = (second, first)
                num_collaborations[pair] = num_collaborations.get(pair, 0) + 1
    return num_collaborations


def _sum_coauthors(aid, paper_authors, num_collaborations):
    """Sum the collaboration counts between ``aid`` and each other author.

    Entries of ``paper_authors`` equal to ``aid`` are skipped.  A pair that
    was never counted contributes 0 rather than raising ``KeyError``.
    """
    total = 0
    for author in paper_authors:
        if author < aid:
            pair = (author, aid)
        elif aid < author:
            pair = (aid, author)
        else:
            # The author under consideration is not their own co-author.
            continue
        total += num_collaborations.get(pair, 0)
    return total


def get_sum_coauthors():
    """Write the "sum of co-author collaborations" feature files.

    For each (author, paper) pair the score is the sum, over the other
    authors in ``authors[p1]``, of the number of author lists they share with
    the author.  Training pairs go to ``sumcoauthors.train.dat`` as
    ``aid p1 score tf`` (``tf`` is ``'T'`` when ``p1`` is in
    ``confirmed[aid]``, else ``'F'``); validation pairs go to
    ``sumcoauthors.valid.dat`` as ``aid p1 score``.

    Progress messages are written to stderr.  Returns nothing.
    """
    print >> sys.stderr, 'Reading data...'
    # Only the second mapping is used; it is indexed by paper id and iterated
    # as a sequence of author ids below.
    _, authors = get_author_papers()

    # Create the number of collaborations between 2 authors.
    num_collaborations = _count_collaborations(authors)

    print >> sys.stderr, 'Calculating scores for train set...'
    confirmed, deleted = get_train()
    # ``with`` guarantees the file is closed even if a lookup below raises.
    with open('sumcoauthors.train.dat', 'w') as outf:
        for aid in confirmed:
            # Build the membership set once per author instead of scanning
            # the confirmed list for every paper.
            confirmed_set = set(confirmed[aid])
            for p1 in confirmed[aid] + deleted[aid]:
                sum_coauthors = _sum_coauthors(
                    aid, authors[p1], num_collaborations)
                tf = 'T' if p1 in confirmed_set else 'F'
                print >> outf, aid, p1, sum_coauthors, tf

    print >> sys.stderr, 'Calculating scores for validation set...'
    validation = get_valid()
    with open('sumcoauthors.valid.dat', 'w') as outf:
        for aid in validation:
            for p1 in validation[aid]:
                sum_coauthors = _sum_coauthors(
                    aid, authors[p1], num_collaborations)
                print >> outf, aid, p1, sum_coauthors
def _venue_count(author_counts, venue_of, aid, paper):
    """Return ``author_counts[aid][venue_of[paper]]`` with a ``-1`` fallback.

    If the paper has no entry in ``venue_of``, or the author has no entry for
    that venue, the value stored under key ``-1`` for the author is returned
    instead.  A ``KeyError`` from the fallback lookup itself propagates.
    NOTE(review): ``-1`` is presumably the "unknown venue" bucket filled in by
    build_author_venue_count; confirm there.
    """
    try:
        return author_counts[aid][venue_of[paper]]
    except KeyError:
        return author_counts[aid][-1]


def get_nvenue():
    """Write the per-author venue-count feature files.

    For each (author, paper) pair two values are looked up: ``sj`` from the
    author's journal counts and ``sc`` from the author's conference counts,
    each keyed by the paper's venue with a fallback to key ``-1``.  Training
    pairs go to ``nvenue.train.dat`` as ``aid p1 sj sc tf`` (``tf`` is ``'T'``
    when ``p1`` is in ``confirmed[aid]``, else ``'F'``); validation pairs go
    to ``nvenue.valid.dat`` as ``aid p1 sj sc``.

    Progress messages are written to stderr.  Returns nothing.
    """
    print >> sys.stderr, 'Reading venue info...'
    journal, conference = get_venue()
    print >> sys.stderr, 'Reading author-paper info...'
    # Only the first mapping is passed on to the venue counters.
    papers, _ = get_author_papers()
    print >> sys.stderr, 'Counting papers in journals...'
    aid2journal = build_author_venue_count(papers, journal)
    print >> sys.stderr, 'Counting papers in conferences...'
    aid2conference = build_author_venue_count(papers, conference)

    print >> sys.stderr, 'Training set...'
    confirmed, deleted = get_train()
    # ``with`` guarantees the file is closed even if a lookup below raises.
    with open('nvenue.train.dat', 'w') as outf:
        for aid in confirmed:
            confirmed_set = set(confirmed[aid])
            # Every p1 comes from confirmed[aid] + deleted[aid], so a paper
            # that is not confirmed is necessarily a deleted one.
            for p1 in confirmed[aid] + deleted[aid]:
                tf = 'T' if p1 in confirmed_set else 'F'
                sj = _venue_count(aid2journal, journal, aid, p1)
                sc = _venue_count(aid2conference, conference, aid, p1)
                print >> outf, aid, p1, sj, sc, tf

    print >> sys.stderr, 'Validation set...'
    validation = get_valid()
    with open('nvenue.valid.dat', 'w') as outf:
        for aid in validation:
            for p1 in validation[aid]:
                sj = _venue_count(aid2journal, journal, aid, p1)
                sc = _venue_count(aid2conference, conference, aid, p1)
                print >> outf, aid, p1, sj, sc
import sys import gc from copy import deepcopy from random import choice from numpy import log, mean, std from scipy.misc import factorial from scipy.sparse import coo_matrix import scipy.sparse.linalg as sp from common import get_author_papers, get_train, get_valid if __name__ == '__main__': print >> sys.stderr, 'Reading data...' papers, authors = get_author_papers() confirmed, deleted = get_train() print >> sys.stderr, 'Calculating scores for train set...' confirmed, deleted = get_train() outf = open('nauthors.train.dat', 'w') for aid in confirmed: allPapers = confirmed[aid] + deleted[aid] for p1 in allPapers: if p1 in confirmed[aid]: tf = 'T' else: tf = 'F' print >> outf, aid, p1, len(authors[p1]), tf outf.close() print >> sys.stderr, 'Calculating scores for validation set...' validation = get_valid()