def get_stats(dataset):
    """Print the size statistics of a dataset as LaTeX table rows.

    Runs a series of COUNT queries against the database named ``dataset``
    and writes a four-column block to stdout: a header row carrying the
    dataset's display name followed by one row per pair of counts.

    dataset -- database name; also the key looked up in TEX_NAMES for the
               header row.
    """
    db = MyMySQL(db=dataset)

    def scalar(sql):
        # Each statistic is the first column of the first returned row.
        return db.select_query(sql)[0][0]

    # The 'aminer' database uses a differently named keyword table.
    if dataset == 'aminer':
        kw_table = 'doc_ngrams'
    else:
        kw_table = 'doc_kws'

    # Entity counts.
    npubs = scalar("select count(*) from papers")
    nauthors = scalar("select count(distinct author_id) from authorships")
    nkws = scalar("select count(distinct ngram) from %s" % kw_table)
    nvenues = scalar("select count(distinct venue_id) from papers")

    # Relation counts.
    pubs_pubs = scalar("select count(*) from graph")
    auths_auths = scalar("select count(*) from coauthorships")
    pubs_authors = scalar("select count(*) from authorships")
    # Only keyword links at or above the configured TF-IDF threshold count.
    pubs_kws = scalar("select count(*) from %s where value>=%f"
                      % (kw_table, config.MIN_NGRAM_TFIDF))

    print("\\hline")
    print("\\multicolumn{4}{|c|}{%s} \\\\" % TEX_NAMES[dataset])
    print("\\hline")

    # One (row template, values) pair per table row, in output order.
    table_rows = (
        ("pubs ($N_p$) & %d & pubs-pubs & %d \\\\", (npubs, pubs_pubs)),
        ("authors & %d & authors-authors & %d \\\\", (nauthors, auths_auths)),
        ("keywords ($N_k$) & %d & pubs-keywords & %d \\\\", (nkws, pubs_kws)),
        ("venues ($N_v$) & %d & pubs-authors & %d \\\\", (nvenues, pubs_authors)),
    )
    for template, values in table_rows:
        print(template % values)
def write_surveys_queries_file(prefix, npubs=110):
    """Write survey papers and their derived queries to ``prefix + ".txt"``.

    Selects papers whose title contains "survey" and whose year lies in
    1950-2014, keeps those for which ``utils.get_cited`` returns at least
    20 entries, and writes one tab-separated line per kept paper:
    ``id<TAB>year<TAB>title<TAB>query``. Stops once ``npubs`` lines have
    been written.

    prefix -- output path without the ".txt" extension.
    npubs  -- maximum number of papers to write (default 110).

    Side effects: prints the number of candidates to stdout and adds
    "survey" to the module-level ``_stop_words_`` set (the addition is not
    undone afterwards).
    """
    db = MyMySQL(db=config.DB_NAME)
    candidates = db.select_query(
        "SELECT id, substring(title,1,140), year FROM papers "
        "WHERE title LIKE '%survey%' AND (year IS NOT NULL) "
        "AND (year BETWEEN 1950 AND 2014)")
    print("Candidates: %s" % len(candidates))

    # Include the word 'survey' for this particular case
    # (presumably so that to_query() drops it from the generated query --
    # confirm against to_query's implementation).
    _stop_words_.add("survey")

    # The context manager guarantees the file is closed even when a lookup
    # or query conversion raises part-way through the loop.
    written = 0
    with open(prefix + ".txt", "w") as out:
        for pub_id, title, year in candidates:
            citations = utils.get_cited(db, pub_id)
            if len(citations) < 20:
                continue

            query = to_query(title)
            out.write("%s\t%d\t%s\t%s\n"
                      % (pub_id, year, title.strip(), query))
            written += 1
            if written >= npubs:
                break
def get_cited_papers(doc_id):
    """Return the resolved citations made by paper ``doc_id``.

    Each returned row is ``(cited_paper_id, start, end)``, where ``start``
    and ``end`` come from the citation group the citation belongs to.
    References without a ``cited_paper_id`` are excluded.

    doc_id -- id of the citing paper.
    """
    connection = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)

    # NOTE(review): doc_id is interpolated straight into the SQL text, so
    # callers must only pass trusted ids.
    sql = (
        "SELECT r.cited_paper_id, g.start, g.end "
        "FROM citations c "
        "JOIN citation_groups g ON c.group_id = g.id "
        "JOIN refs r ON c.ref_id=r.id "
        "WHERE c.paper_id='%s' AND r.cited_paper_id IS NOT NULL"
    ) % doc_id
    return connection.select_query(sql)
def __init__(self):
    """Create the index handle and cache per-paper citation counts.

    Sets ``self.index`` to an ``Index`` built from ``config.INDEX_PATH``
    and ``self.ncitations`` to a dict mapping each ``cited`` value in the
    ``graph`` table to the number of rows that reference it.
    """
    self.index = Index(config.INDEX_PATH)

    # A single grouped query up front gives fast dict lookups afterwards.
    connection = MyMySQL(db=config.DB_NAME,
                         user=config.DB_USER,
                         passwd=config.DB_PASSWD)
    count_rows = connection.select_query(
        "SELECT cited, COUNT(*) from graph GROUP BY cited")

    # Each row is a (cited, count) pair, which dict() consumes directly.
    self.ncitations = dict(count_rows)
def get_citing_papers(doc_id):
    """Return the citation spans of paper ``doc_id`` grouped by citing paper.

    The result is a ``defaultdict(list)`` mapping each citing paper id to
    the list of ``(start, end)`` pairs taken from the citation groups in
    which ``doc_id`` is cited.

    doc_id -- id of the cited paper.
    """
    connection = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)

    # NOTE(review): doc_id is interpolated straight into the SQL text, so
    # callers must only pass trusted ids.
    sql = (
        "SELECT r.paper_id, cg.start, cg.end "
        "FROM refs r "
        "JOIN citations c ON r.id=c.ref_id "
        "JOIN citation_groups cg ON c.group_id=cg.id "
        "WHERE cited_paper_id='%s' "
    ) % doc_id
    matches = connection.select_query(sql)

    # Bucket every span under the paper that contains it.
    spans_by_paper = defaultdict(list)
    for paper_id, span_start, span_end in matches:
        spans_by_paper[paper_id].append((span_start, span_end))
    return spans_by_paper