def get_nauthors():
    """Feature: number of authors listed on each paper."""
    print >> sys.stderr, 'Reading data...'
    papers, authors = get_author_papers()

    print >> sys.stderr, 'Calculating scores for train set...'
    confirmed, deleted = get_train()
    outf = open('nauthors.train.dat', 'w')
    for aid in confirmed:
        allPapers = confirmed[aid] + deleted[aid]
        for p1 in allPapers:
            if p1 in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            print >> outf, aid, p1, len(authors[p1]), tf
    outf.close()

    print >> sys.stderr, 'Calculating scores for validation set...'
    validation = get_valid()
    outf = open('nauthors.valid.dat', 'w')
    for aid in validation:
        allPapers = validation[aid]
        for p1 in allPapers:
            print >> outf, aid, p1, len(authors[p1])
    outf.close()

def get_sum_coauthors():
    """Feature: total number of prior collaborations between the target author
    and the other authors listed on the paper."""
    print >> sys.stderr, 'Reading data...'
    papers, authors = get_author_papers()

    # Count collaborations between every pair of authors; the pair key always
    # puts the lower author id first.
    num_collaborations = {}
    for paper_authors in authors.values():
        for i in range(0, len(paper_authors)):
            for j in range(i + 1, len(paper_authors)):
                a, b = paper_authors[i], paper_authors[j]
                if a > b:
                    a, b = b, a
                try:
                    num_collaborations[(a, b)] += 1
                except KeyError:
                    num_collaborations[(a, b)] = 1

    print >> sys.stderr, 'Calculating scores for train set...'
    confirmed, deleted = get_train()
    outf = open('sumcoauthors.train.dat', 'w')
    for aid in confirmed:
        allPapers = confirmed[aid] + deleted[aid]
        for p1 in allPapers:
            sum_coauthors = 0
            for author in authors[p1]:
                if author < aid:
                    sum_coauthors += num_collaborations[(author, aid)]
                elif aid < author:
                    sum_coauthors += num_collaborations[(aid, author)]
            if p1 in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            print >> outf, aid, p1, sum_coauthors, tf
    outf.close()

    print >> sys.stderr, 'Calculating scores for validation set...'
    validation = get_valid()
    outf = open('sumcoauthors.valid.dat', 'w')
    for aid in validation:
        allPapers = validation[aid]
        for p1 in allPapers:
            sum_coauthors = 0
            for author in authors[p1]:
                if author < aid:
                    sum_coauthors += num_collaborations[(author, aid)]
                elif aid < author:
                    sum_coauthors += num_collaborations[(aid, author)]
            print >> outf, aid, p1, sum_coauthors
    outf.close()

def get_nvenue():
    """Feature: how many of the author's papers appear in the same journal and
    in the same conference as the candidate paper."""
    print >> sys.stderr, 'Reading venue info...'
    journal, conference = get_venue()
    print >> sys.stderr, 'Reading author-paper info...'
    papers, authors = get_author_papers()
    print >> sys.stderr, 'Counting papers in journals...'
    aid2journal = build_author_venue_count(papers, journal)
    print >> sys.stderr, 'Counting papers in conferences...'
    aid2conference = build_author_venue_count(papers, conference)

    print >> sys.stderr, 'Training set...'
    confirmed, deleted = get_train()
    outf = open('nvenue.train.dat', 'w')
    for aid in confirmed:
        for p1 in confirmed[aid] + deleted[aid]:
            if p1 in confirmed[aid]:
                tf = 'T'
            elif p1 in deleted[aid]:
                tf = 'F'
            else:
                raise ValueError('paper %s is neither confirmed nor deleted '
                                 'for author %s' % (p1, aid))
            try:
                sj = aid2journal[aid][journal[p1]]
            except KeyError:
                sj = aid2journal[aid][-1]
            try:
                sc = aid2conference[aid][conference[p1]]
            except KeyError:
                sc = aid2conference[aid][-1]
            print >> outf, aid, p1, sj, sc, tf
    outf.close()

    print >> sys.stderr, 'Validation set...'
    validation = get_valid()
    outf = open('nvenue.valid.dat', 'w')
    for aid in validation:
        for p1 in validation[aid]:
            try:
                sj = aid2journal[aid][journal[p1]]
            except KeyError:
                sj = aid2journal[aid][-1]
            try:
                sc = aid2conference[aid][conference[p1]]
            except KeyError:
                sc = aid2conference[aid][-1]
            print >> outf, aid, p1, sj, sc
    outf.close()

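# ``build_author_venue_count`` is not shown in this file. Below is a minimal
# sketch of what it is assumed to do, based only on how get_nvenue() indexes
# its result: per author, count papers per venue id, with a fallback entry
# under the key -1 for papers whose venue is unknown. The body and the -1
# convention are assumptions, not the confirmed implementation.
def build_author_venue_count(papers, venue):
    # papers: dict mapping author id -> list of paper ids
    # venue:  dict mapping paper id -> venue id (journal or conference)
    counts = {}
    for aid, pids in papers.items():
        per_venue = {-1: 0}                 # -1 collects papers with no known venue
        for pid in pids:
            vid = venue.get(pid, -1)        # unknown venue falls back to -1
            per_venue[vid] = per_venue.get(vid, 0) + 1
        counts[aid] = per_venue
    return counts
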
def get_affiliation_score():
    """Feature: how well the affiliation printed on the paper matches the
    author's reference affiliation."""
    print >> sys.stderr, 'Loading author reference affiliations...'
    affil_auth = get_affsauthors()
    print >> sys.stderr, 'Loading author-paper table...'
    affil_paper = get_affspapers()

    print >> sys.stderr, 'Creating training...'
    confirmed, deleted = get_train()
    outf = open('affiliation.train.dat', 'w')
    count, tot = 0, len(confirmed)
    for aid in confirmed:
        count += 1
        print >> sys.stderr, '%d / %d' % (count, tot)
        all = confirmed[aid] + deleted[aid]
        for pid in all:
            if pid in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            scorea = affil_align_score(affil_paper[aid][pid], affil_auth[aid])
            ## scorep = mean(
            ##     [affil_align_score(affil_paper[aid][pid],
            ##                        affil_paper[aid][pid2])
            ##      for pid2 in all if pid2 != pid]
            ## )
            ## scorem = max(scorea, scorep)
            ## print >> outf, aid, pid, scorea, scorep, scorem, tf
            print >> outf, aid, pid, scorea, tf
    outf.close()

    print >> sys.stderr, 'Creating validation...'
    validation = get_valid()
    outf = open('affiliation.valid.dat', 'w')
    for aid in validation:
        for pid in validation[aid]:
            scorea = affil_align_score(affil_paper[aid][pid], affil_auth[aid])
            ## scorep = mean(
            ##     [affil_align_score(affil_paper[aid][pid],
            ##                        affil_paper[aid][pid2])
            ##      for pid2 in validation[aid] if pid2 != pid]
            ## )
            ## scorem = max(scorea, scorep)
            print >> outf, aid, pid, scorea
    outf.close()

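# ``affil_align_score`` is defined elsewhere; the call sites above only show
# that it takes the affiliation string recorded on the paper and the author's
# reference affiliation and returns a single similarity value. A minimal
# sketch, assuming a simple token-overlap (Jaccard) similarity; the real
# implementation may use a proper string alignment instead.
def affil_align_score(paper_affil, author_affil):
    # Lowercase, split on whitespace, and compare the resulting token sets.
    tokens_p = set(paper_affil.lower().split())
    tokens_a = set(author_affil.lower().split())
    if not tokens_p or not tokens_a:
        return 0.0
    return len(tokens_p & tokens_a) / float(len(tokens_p | tokens_a))
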
def compute_venue_score():
    """Feature: how likely the candidate paper's venue is, given the venues of
    the author's other papers."""
    venue = get_venue()
    confirmed, deleted = get_train()
    P1All, P2All, norm1All, norm2All = get_ps(confirmed, venue)

    count, tot = 0, len(confirmed)
    outf = open('venue.train.dat', 'w')
    for aid in confirmed:
        count += 1
        print >> sys.stderr, '%d / %d' % (count, tot)
        # Leave the current author out of the global venue statistics.
        P1, P2 = ps_without_author(
            aid, confirmed, venue, P1All, P2All, norm1All, norm2All
        )
        all = confirmed[aid] + deleted[aid]
        for p1 in all:
            if p1 in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            others = [p for p in all if p != p1]
            s = get_score(p1, others, venue, P1, P2)
            if s > -.5:
                print >> outf, aid, p1, s, tf
    outf.close()

    validation = get_valid()
    count, tot = 0, len(validation)
    outf = open('venue.valid.dat', 'w')
    for aid in validation:
        count += 1
        print >> sys.stderr, '%d / %d' % (count, tot)
        all = validation[aid]
        for p1 in all:
            others = [p for p in all if p != p1]
            s = get_score(p1, others, venue, P1All, P2All)
            if s > -.5:
                print >> outf, aid, p1, s
    outf.close()

def get_name_score():
    """Feature: how well the author name printed on the paper matches the
    author's reference name (full-name and initials scores)."""
    print >> sys.stderr, 'Loading author-paper table...'
    paperName = get_paper_name()
    print >> sys.stderr, 'Loading author reference names...'
    baseName = get_base_name()

    confirmed, deleted = get_train()
    outf = open('name.train.dat', 'w')
    for aid in confirmed:
        for pid in confirmed[aid]:
            try:
                sFull, sInit = name_align_score(paperName[aid][pid], baseName[aid])
                print >> outf, aid, pid, sFull, sInit, 'T'
            except KeyError:
                pass
        for pid in deleted[aid]:
            try:
                sFull, sInit = name_align_score(paperName[aid][pid], baseName[aid])
                print >> outf, aid, pid, sFull, sInit, 'F'
            except KeyError:
                pass
    outf.close()

    validation = get_valid()
    outf = open('name.valid.dat', 'w')
    for aid in validation:
        for pid in validation[aid]:
            try:
                sFull, sInit = name_align_score(paperName[aid][pid], baseName[aid])
                print >> outf, aid, pid, sFull, sInit
            except KeyError:
                pass
    outf.close()

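# ``name_align_score`` is defined elsewhere; the calls above only show that it
# compares the name printed on the paper with the author's reference name and
# returns two values, used here as (sFull, sInit). A minimal sketch, assuming
# the first score matches full name tokens and the second matches initials
# only; this interface is an assumption, not the original implementation.
def name_align_score(paper_name, base_name):
    tokens_p = paper_name.lower().split()
    tokens_b = base_name.lower().split()
    if not tokens_p or not tokens_b:
        return 0.0, 0.0
    # Full-name score: fraction of reference tokens found verbatim on the paper.
    s_full = sum(1.0 for t in tokens_b if t in tokens_p) / len(tokens_b)
    # Initials score: fraction of reference initials found among the paper
    # name's initials.
    inits_p = set(t[0] for t in tokens_p)
    s_init = sum(1.0 for t in tokens_b if t[0] in inits_p) / len(tokens_b)
    return s_full, s_init
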
def get_year_score():
    """Feature: z-score of the paper's publication year relative to the
    author's other papers."""
    print >> sys.stderr, 'Loading publication years...'
    year = get_year()

    print >> sys.stderr, 'Creating training...'
    confirmed, deleted = get_train()
    outf = open('year.train.dat', 'w')
    count, tot = 0, len(confirmed)
    for aid in confirmed:
        count += 1
        print >> sys.stderr, '%d / %d' % (count, tot)
        all = confirmed[aid] + deleted[aid]
        paperYears = [year[p] for p in all]
        for pid in all:
            if pid in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            score = (year[pid] - mean(paperYears)) / std(paperYears)
            if isnan(score):
                score = -100
            print >> outf, aid, pid, score, tf
    outf.close()

    print >> sys.stderr, 'Creating validation...'
    validation = get_valid()
    outf = open('year.valid.dat', 'w')
    for aid in validation:
        all = validation[aid]
        paperYears = [year[p] for p in all]
        for pid in all:
            score = (year[pid] - mean(paperYears)) / std(paperYears)
            if isnan(score):
                score = -100
            print >> outf, aid, pid, score
    outf.close()

def compute_title_score():
    """Feature (exploratory): average title similarity between a paper and the
    author's other papers. Results are only printed to stderr."""
    confirmed, deleted = get_train()
    title = get_titles()
    for aid in confirmed:
        all = confirmed[aid] + deleted[aid]
        for pid in [p for p in confirmed[aid] if title.has_key(p)]:
            score = mean(
                [title_align_score(title[pid], title[pid2])
                 for pid2 in all if pid2 != pid and title.has_key(pid2)]
            )
            print >> sys.stderr, aid, pid, score, 'T'
        for pid in [p for p in deleted[aid] if title.has_key(p)]:
            score = mean(
                [title_align_score(title[pid], title[pid2])
                 for pid2 in all if pid2 != pid and title.has_key(pid2)]
            )
            print >> sys.stderr, aid, pid, score, 'F'

import sys
import gc
from copy import deepcopy
from random import choice

from numpy import isnan, log, mean, std
from scipy.misc import factorial
from scipy.sparse import coo_matrix
import scipy.sparse.linalg as sp

from common import get_author_papers, get_train, get_valid


if __name__ == '__main__':
    # Entry point: compute the number-of-authors feature.
    get_nauthors()

def get_kw_score(start=None, nauthors=100):
    """Feature: number of keywords on the paper and average keyword overlap
    with the author's other papers. ``start`` and ``nauthors`` allow the
    author list to be processed in chunks."""
    kws = get_kws()

    print >> sys.stderr, 'Training set...'
    confirmed, deleted = get_train()
    aids = confirmed.keys()[:]
    naids = len(aids)
    if start is not None:
        start = int(start)
        nauthors = int(nauthors)
        aids = aids[
            min(naids, nauthors * (start - 1)):min(naids, nauthors * start)
        ]
        outFileName = 'keywords.train_%d-%d.dat' % (
            min(naids, nauthors * (start - 1)), min(naids, nauthors * start)
        )
    else:
        outFileName = 'keywords.train.dat'
    count, tot = 0, len(aids)
    outf = open(outFileName, 'w')
    for aid in aids:
        count += 1
        print >> sys.stderr, '%d / %d' % (count, tot)
        all = confirmed[aid] + deleted[aid]
        for pid in all:
            if pid in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            scoren = len(kws[pid])
            scorea = mean(
                [kw_align_score(kws[pid], kws[pid2])
                 for pid2 in all if pid2 != pid]
            )
            print >> outf, aid, pid, scorea, scoren, tf
    outf.close()

    print >> sys.stderr, 'Valid set...'
    validation = get_valid()
    aids = validation.keys()[:]
    naids = len(aids)
    if start is not None:
        aids = aids[
            min(naids, nauthors * (start - 1)):min(naids, nauthors * start)
        ]
        outFileName = 'keywords.valid_%d-%d.dat' % (
            min(naids, nauthors * (start - 1)), min(naids, nauthors * start)
        )
    else:
        outFileName = 'keywords.valid.dat'
    count, tot = 0, len(aids)
    outf = open(outFileName, 'w')
    for aid in aids:
        all = validation[aid]
        for pid in all:
            scoren = len(kws[pid])
            scorea = mean(
                [kw_align_score(kws[pid], kws[pid2])
                 for pid2 in all if pid2 != pid]
            )
            print >> outf, aid, pid, scorea, scoren
    outf.close()

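# ``kw_align_score`` is defined elsewhere; from its use in get_kw_score it
# takes the keyword collections of two papers and returns a similarity value.
# A minimal sketch, assuming Jaccard overlap between keyword sets; the real
# scoring function may differ.
def kw_align_score(kws1, kws2):
    s1, s2 = set(kws1), set(kws2)
    if not s1 or not s2:
        return 0.0
    return len(s1 & s2) / float(len(s1 | s2))
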