def load_contexts():
    db_csx = MyMySQL(db="csx", user="******", passwd="")
    db_cg = MyMySQL(db="csx_citegraph", user="******", passwd="")

    pubs = get_pubs(db_csx)
    print "%d publications loaded." % len(pubs)

    # Map publication ids to their clusters for fast lookup
    clusters = {str(pub_id): cluster for pub_id, cluster in pubs}

    citations = get_citations(db_csx)
    print "%d citations loaded." % len(citations)

    found = 0
    for n, (citing, cited) in enumerate(citations):
        cciting = clusters[str(citing)]
        ccited = clusters[str(cited)]

        context = get_context(db_cg, cciting, ccited)
        if context is None:
            context = ''
        else:
            # Escape single quotes so the context can be safely quoted in SQL
            context = context.replace("'", '"')
            found += 1

        try:
            update_graph(db_csx, citing, cited, context)
        except Exception:
            print "Exception when updating 'graph' table."

    print "%d out of %d contexts found." % (found, n + 1)
def __init__(self, ec2_manager, ec2_instance_id, ec2_instance_dns):
    '''
    Since this is run on the main process, it shouldn't open
    connections or file descriptors.
    '''
    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # EC2 manager to issue commands
    self.ec2_manager = ec2_manager

    # EC2 instance information to be used as a proxy
    self.ec2_instance_id = ec2_instance_id
    self.ec2_instance_dns = ec2_instance_dns

    # Logging configuration
    self.log = utils.config_logging('downloader', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")
def write_surveys_queries_file(prefix, npubs=110):
    db = MyMySQL(db=config.DB_NAME)
    candidates = db.select_query('''SELECT id, substring(title,1,140), year FROM papers
                                    WHERE title LIKE '%survey%'
                                      AND (year IS NOT NULL)
                                      AND (year BETWEEN 1950 AND 2014)''')
    print "Candidates: %d" % len(candidates)

    # Include the word 'survey' for this particular case
    _stop_words_.add("survey")

    # Write candidates to file, keeping only publications cited at least 20 times
    file = open(prefix + ".txt", "w")
    n = 0
    for pub_id, title, year in candidates:
        citations = utils.get_cited(db, pub_id)
        if len(citations) >= 20:
            query = to_query(title)
            print >> file, "%s\t%d\t%s\t%s" % (pub_id, year, title.strip(), query)

            n += 1
            if n >= npubs:
                break

    file.close()
def manual_queries_topic_graphs(from_dataset, to_dataset):
    db = MyMySQL(db=to_dataset)
    pub_ids = set(db.select("id", table="papers"))

    from_folder = config.DATA + "query_sets/" + from_dataset + "/manual/"
    to_folder = config.DATA + "query_sets/" + to_dataset + "/manual/"
    for file_name in os.listdir(from_folder):
        print file_name

        from_file = open(from_folder + file_name, 'r')
        to_file = open(to_folder + file_name, 'w')

        # Read the header line and write it back unchanged
        header = from_file.readline().strip('\n')
        print >> to_file, header

        for line in from_file:
            relev, pub_id, title = line.strip().split('\t')

            # Blank out ids that don't exist in the target dataset
            if pub_id not in pub_ids:
                pub_id = ''

            print >> to_file, "%s\t%s\t%s" % (relev, pub_id, title)

        from_file.close()
        to_file.close()
def get_stats(dataset):
    db = MyMySQL(db=dataset)
    kw_table = 'doc_ngrams' if (dataset == 'aminer') else 'doc_kws'

    # Node counts
    npubs = db.select_query("select count(*) from papers")[0][0]
    nauthors = db.select_query("select count(distinct author_id) from authorships")[0][0]
    nkws = db.select_query("select count(distinct ngram) from %s" % kw_table)[0][0]
    nvenues = db.select_query("select count(distinct venue_id) from papers")[0][0]

    # Edge counts
    pubs_pubs = db.select_query("select count(*) from graph")[0][0]
    auths_auths = db.select_query("select count(*) from coauthorships")[0][0]
    pubs_authors = db.select_query("select count(*) from authorships")[0][0]
    pubs_kws = db.select_query("select count(*) from %s where value>=%f"
                               % (kw_table, config.MIN_NGRAM_TFIDF))[0][0]

    # Print the counts as rows of a LaTeX tabular
    print "\\hline"
    print "\\multicolumn{4}{|c|}{%s} \\\\" % TEX_NAMES[dataset]
    print "\\hline"
    print "pubs ($N_p$) & %d & pubs-pubs & %d \\\\" % (npubs, pubs_pubs)
    print "authors & %d & authors-authors & %d \\\\" % (nauthors, auths_auths)
    print "keywords ($N_k$) & %d & pubs-keywords & %d \\\\" % (nkws, pubs_kws)
    print "venues ($N_v$) & %d & pubs-authors & %d \\\\" % (nvenues, pubs_authors)
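# For illustration only (the dataset label is hypothetical): with the debug
# values that used to be hardcoded above (npubs=1, nauthors=2, nkws=3, nvenues=4,
# pubs_pubs=1, auths_auths=4, pubs_authors=2, pubs_kws=3) and
# TEX_NAMES[dataset] == 'CiteSeerX', get_stats would print:
#
#   \hline
#   \multicolumn{4}{|c|}{CiteSeerX} \\
#   \hline
#   pubs ($N_p$) & 1 & pubs-pubs & 1 \\
#   authors & 2 & authors-authors & 4 \\
#   keywords ($N_k$) & 3 & pubs-keywords & 3 \\
#   venues ($N_v$) & 4 & pubs-authors & 2 \\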
def __init__(self):
    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # Logging configuration
    self.log = utils.config_logging('tokenizer', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")

    self.MIN_TOKENS = 10

    # Create the output folders if they don't exist yet
    utils.ensure_folder(os.path.dirname(config.TOKENS_PATH))
    utils.ensure_folder(os.path.dirname(config.TOKENS_PATH_PARTS))
def get_cited_papers(doc_id):
    db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)
    return db.select_query("""SELECT r.cited_paper_id, g.start, g.end
                              FROM citations c
                              JOIN citation_groups g ON c.group_id = g.id
                              JOIN refs r ON c.ref_id = r.id
                              WHERE c.paper_id='%s'
                                AND r.cited_paper_id IS NOT NULL""" % doc_id)
def __init__(self, n=None):
    db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)
    rows = db.select(fields=["id", "title", "abstract"], table="papers")
    if n:
        rows = random.sample(rows, n)

    # 'abstract' instead of 'abs' to avoid shadowing the built-in
    self.pubs = {str(id): (title, abstract) for id, title, abstract in rows}
def main(argv):
    query = None
    usr = None
    output_file = None
    pwd = None
    n = 20

    try:
        opts, _args_ = getopt.getopt(argv, "hq:o:n:u:p:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == "-q":
            query = arg
        elif opt == "-o":
            output_file = arg
        elif opt == "-n":
            n = int(arg)
        elif opt == "-u":
            usr = arg
        elif opt == "-p":
            pwd = arg
        else:
            print "Invalid option: %s" % opt

    # Check mandatory arguments
    if (not query or not usr or not pwd):
        usage()
        sys.exit(2)

    s = searchers.Searcher(**config.PARAMS)
    pub_ids = s.search(query, limit=n)

    if not output_file:
        output_file = utils.get_graph_file_name(query)

    # Write the graph structure as a GEXF file
    nx.write_gexf(s.graph, output_file)

    # Print the results
    db = MyMySQL(db='csx', user=usr, passwd=pwd)
    for id in pub_ids:
        print "%12s\t %s" % (id, db.select_one("title", table="papers", where="id='%s'" % id))
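# A typical invocation, assuming this module is the script's entry point
# (script name, query and credentials below are hypothetical):
#
#   python search_cli.py -q "topic modeling" -n 10 -u someuser -p somepass -o tm.gexf
#
# -q, -u and -p are mandatory; -n defaults to 20 results and -o defaults to a
# file name derived from the query by utils.get_graph_file_name().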
def write_surveys_queries(n=110):
    db = MyMySQL(db=config.DB_NAME)
    if not os.path.exists(config.QUERY_SETS_PATH):
        os.mkdir(config.QUERY_SETS_PATH)

    prefix = config.QUERY_SETS_PATH + "surveys"
    # write_surveys_queries_file(prefix, n)
    write_query_set_folder(db, prefix)
def __init__(self):
    self.index = Index(config.INDEX_PATH)

    # Get citation counts and store them in a dict for fast lookup
    db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)
    ncitations = db.select_query("SELECT cited, COUNT(*) from graph GROUP BY cited")
    self.ncitations = dict(ncitations)
def write_citations_queries(name1, n1, name2, n2):
    db = MyMySQL(db=config.DB_NAME)
    if not os.path.exists(config.QUERY_SETS_PATH):
        os.mkdir(config.QUERY_SETS_PATH)

    path1 = config.QUERY_SETS_PATH + name1
    path2 = config.QUERY_SETS_PATH + name2

    # write_citations_query_set_files(db, path1, n1, path2, n2)
    write_query_set_folder(db, path1)
    write_query_set_folder(db, path2)
def keyword_centric(keyword, from_db, to_db):
    db = MyMySQL(db=from_db)
    pub_ids = db.select("paper_id", table="keywords", where="kw='%s'" % keyword)

    nodes = set()
    new_nodes = set()
    new_nodes.update(pub_ids)

    # Expand one citation hop at a time until enough nodes are gathered
    n = 50000
    while len(nodes) < n:
        new_nodes = get_next_hop(new_nodes)
        nodes.update(new_nodes)
        print len(nodes)

    print "Adding %d nodes." % len(nodes)
    new_db = MyMySQL(db=to_db)
    # values = ','.join(['%s'%id for id in nodes])
    new_db.insert(into="use_papers", fields=["paper_id"], values=list(nodes))
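# get_next_hop() is defined elsewhere; a plausible sketch of its contract
# (an assumption for illustration, not the original implementation): given a
# set of paper ids, return every paper one citation hop away in the graph table.
#
#   def get_next_hop(pub_ids):
#       next_hop = set()
#       for pid in pub_ids:
#           next_hop.update(db.select("cited", table="graph", where="citing='%s'" % pid))
#           next_hop.update(db.select("citing", table="graph", where="cited='%s'" % pid))
#       return next_hop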
def check_ids(folder):
    db = MyMySQL(db='csx')
    for i in xrange(1, 8):
        print i
        print

        with open(folder + str(i) + ".txt") as file:
            _header_ = file.readline()
            for line in file:
                relev, pub_id, title = line.strip().split('\t')
                if len(db.select("id", table="papers", where="id='%s'" % pub_id)) == 0:
                    print "Pub not found:", pub_id
def find_ids_unsupervised(titles, index_folder):
    db = MyMySQL(db='csx')
    index = Index(index_folder)

    found = 0
    doc_ids = []
    for title in titles:
        top_docs, scores = index.search(title,
                                        search_fields=["title"],
                                        return_fields=["id"],
                                        return_scores=True,
                                        limit=5)
        # ids = index.get_documents(top_docs, fields="id")

        # To decide if the most similar title in the index is a hit, we check if its score
        # is significantly higher than those of the hits that follow it (second to fifth)
        if len(scores) > 2 and (scores[0] > 2 * np.mean(scores[1:])):
            doc_ids.append(top_docs[0][0])
            found += 1
        else:
            doc_ids.append("")

        # Only enable for debugging and finding a threshold
        if 0:
            print "-------"
            print "%s" % (title)
            print "-------"
            for i, (id, ) in enumerate(top_docs):
                title = db.select_one("title", table="papers", where="id='%s'" % id)
                print "%.2f\t%s" % (scores[i], title.encode("UTF-8"))

            if (scores[0] > 2 * np.mean(scores[1:])):
                print "Found!",
                op = '>'
            else:
                print "Not found!",
                op = '<'
            print "(%.2f %s %.2f)\n" % (scores[0], op, 2 * np.mean(scores[1:]))

    return doc_ids
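# For intuition on the acceptance heuristic above (scores are made up): the top
# hit is taken only when its score is more than twice the mean of the runner-ups.
#
#   scores = [9.1, 2.3, 2.0, 1.8, 1.7]   # 9.1 > 2 * 1.95  = 3.90 -> hit
#   scores = [3.2, 2.9, 2.6, 2.4, 2.2]   # 3.2 < 2 * 2.525 = 5.05 -> no hit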
def get_citing_papers(doc_id):
    db = MyMySQL(db=DB_NAME, user=DB_USER, passwd=DB_PASSWD)
    query = """SELECT r.paper_id, cg.start, cg.end
               FROM refs r
               JOIN citations c ON r.id = c.ref_id
               JOIN citation_groups cg ON c.group_id = cg.id
               WHERE cited_paper_id='%s' """ % doc_id
    rows = db.select_query(query)

    # Group citations by citing paper
    citations = defaultdict(list)
    for citing_paper, start, end in rows:
        citations[citing_paper].append((start, end))

    return citations
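# Minimal usage sketch (the document id is hypothetical): the returned dict maps
# each citing paper to the character offsets of its citation groups.
#
#   citing = get_citing_papers('10.1.1.42.1234')
#   for citing_id, spans in citing.iteritems():
#       print citing_id, spans   # e.g. [(120, 135), (2210, 2224)]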
def time_diversity(names, query_set):
    # Get the year of each paper so returned results can be mapped to years
    db = MyMySQL(db=config.DATASET)
    rows = db.select(["id", "year"], table="papers",
                     where="year is not NULL and year between 1950 and 2013")
    years = {pub_id: year for pub_id, year in rows}

    for name in names:
        file_path = "%s/results/%s/%s/%s.p" % (config.DATA, config.DATASET, query_set, name)

        returned_years = []
        results = cPickle.load(open(file_path, 'r'))
        for _correct, _relevances, returned in results:
            for r in returned:
                if r in years:
                    returned_years.append(years[r])

        print "%s\t%.2f\t%.2f" % (name, np.mean(returned_years), np.std(returned_years))
def fix_contexts_limits():
    """
    Updates the contexts in the graph table so that the tokens at the
    extremities are removed. These are usually parts of words, and
    therefore meaningless.
    """
    db = MyMySQL(db="csx", user="******", passwd="")
    ctxs = db.select(["citing", "cited", "context"], table="graph", where="context != ''")
    print len(ctxs)

    for citing, cited, ctx in progress(ctxs):
        # Trim everything before the first and after the last space
        s = ctx.find(" ")
        e = ctx.rfind(" ")

        db.update(table="graph",
                  set="context='%s'" % ctx[s + 1:e],
                  where="(citing='%s') AND (cited='%s')" % (citing, cited))
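# A worked example of the trimming above (the context string is made up): slicing
# between the first and last space drops the partial words at both extremities.
#
#   >>> ctx = "ethod described in [3] outperforms the baseli"
#   >>> s, e = ctx.find(" "), ctx.rfind(" ")
#   >>> ctx[s + 1:e]
#   'described in [3] outperforms the'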
def get_layer_results(queries, searcher, folder, layer):
    db = MyMySQL(db=config.DATASET)

    def get_pub(pub_id):
        return db.select_one("title", table="papers", where="id='%s'" % pub_id)

    def get_author(author_id):
        return db.select_one("name", table="authors", where="cluster=%s" % author_id)

    def get_venue(venue_id):
        abbrev, name = db.select_one(["abbrev", "name"], table="venues", where="id=%s" % venue_id)
        return " ".join((abbrev, name)).strip()

    def get_keyword(kw):
        return kw

    # Create the folder that will hold the results for this layer
    if not os.path.exists(folder):
        os.makedirs(folder)

    print "\n%s" % folder

    # Each layer has a different handler to get the name of the entity
    get_entities = {'paper': get_pub,
                    'author': get_author,
                    'venue': get_venue,
                    'ngram': get_keyword}

    # Now fetch the results and save them
    for query in queries:
        file_path = os.path.join(folder, query.replace(' ', '+') + ".txt")
        print " ", query

        entity_ids = searcher.search(query, rtype=layer, limit=50)
        with open(file_path, 'w') as file:
            for eid in entity_ids:
                name = get_entities[layer](eid).strip()
                print >> file, "%s" % (name.encode("UTF-8"))
def __init__(self):
    '''Converter constructor.'''

    # Zeno task manager
    self.tasks = zeno.TasksManager("tasks",
                                   host=config.DB_HOST,
                                   user=config.DB_USER,
                                   passwd=config.DB_PASSWD)

    # Database connection
    self.db = MyMySQL(db=config.DB_NAME,
                      host=config.DB_HOST,
                      user=config.DB_USER,
                      passwd=config.DB_PASSWD)

    # Logging configuration
    self.log = utils.config_logging('converter', stream=sys.stdout, level=logging.DEBUG,
                                    format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                    datefmt="%Y-%m-%d %H:%M:%S")
def get_texts(pub_ids, use_title=True, use_abs=True):
    '''
    This is a non-batch version. Much slower but more memory efficient.
    '''
    db = MyMySQL(db='csx', user='******', passwd='')

    fields = []
    if use_title:
        fields.append("title")
    if use_abs:
        fields.append("abstract")

    texts = []
    for pub_id in pub_ids:
        text_fields = db.select_one(fields=fields, table="papers", where="id='%s'" % pub_id)

        text = ''
        for tf in text_fields:
            if tf is not None:
                text += tf

        texts.append(text)

    return texts
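# Usage sketch (the ids are hypothetical): fetch title-only texts for two papers.
#
#   texts = get_texts(['10.1.1.1.1', '10.1.1.2.2'], use_title=True, use_abs=False)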
def search(self, query, exclude=[], limit=20, force=False):
    file_path = config.CITERANK_FILE_PATH
    if not os.path.exists(file_path):
        g = nx.DiGraph()
        g.add_edges_from(model.get_all_edges())

        # Remove documents from the exclude list
        g.remove_nodes_from(exclude)

        # Get the year of each paper for assembling the personalization array next
        db = MyMySQL(db=config.DATASET)
        rows = db.select(["id", "year"], table="papers")
        years = {}
        for pub_id, year in rows:
            if year is not None:
                years[pub_id] = year

        # Calculate the median to use for missing values
        year_median = np.median(years.values())

        # Create a personalization array by exponentially decaying
        # each paper's factor by its age
        pers = {}
        for node in g.nodes():
            if (node not in years) or (years[node] < 1960) or (years[node] > 2013):
                years[node] = year_median

            pers[node] = np.exp(float(years[node] - 2013) / self.tau)

        print "Running PageRank with %d nodes and age-defined personalization vector." % g.number_of_nodes()
        r = nx.pagerank(g, personalization=pers)

        print "Writing results"
        cPickle.dump(r, open(file_path, "w"))

    # Load cached PageRank values for every node
    r = cPickle.load(open(file_path, "r"))

    # Sort documents decreasingly by PageRank value
    ids, _scores = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))

    # Fetch all documents that have at least one of the terms.
    # Store them in a set for fast lookup.
    pub_ids = self.index.search(query, search_fields=["title", "abstract"],
                                return_fields=["id"], ignore=exclude)
    pub_ids = set([pid for (pid, ) in pub_ids])

    results = []
    for id in ids:
        if id in pub_ids:
            results.append(id)
            if len(results) == limit:
                break

    return results
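# A quick sanity check of the age-decay factor used above, pers = exp((year - 2013) / tau),
# with a hypothetical tau = 5.0 (recent papers dominate the personalization vector):
#
#   >>> import numpy as np
#   >>> [round(np.exp(float(y - 2013) / 5.0), 3) for y in (2013, 2008, 1993)]
#   [1.0, 0.368, 0.018]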
'''
Created on Jun 29, 2015

@author: luamct
'''
from mymysql.mymysql import MyMySQL
import random
import networkx as nx
from utils import progress

db = MyMySQL(db='csx')


def get_cited(pub_id):
    return db.select("cited", table="graph", where="citing='%s'" % pub_id)


def get_citing(pub_id):
    return db.select("citing", table="graph", where="cited='%s'" % pub_id)


def get_neighbours(pub_id):
    return db.select(["citing", "cited"], table="graph",
                     where="citing='%s' OR cited='%s'" % (pub_id, pub_id))


def depth_walk():
    ids = db.select("id", table="papers", limit=10000)
from contexts import contexts
from collections import Counter, defaultdict
from mymysql.mymysql import MyMySQL
import nltk
import re
import utils
from utils import tokenize
import logging
import sys
from config import DATA, TOKENS_PATH, DB_NAME
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set(nltk.corpus.stopwords.words('english'))

NUMBER = re.compile(r"\d+$")
VARIABLE = re.compile(r"\w\d$")

db = MyMySQL(db=DB_NAME)


def filter_tokens(tokens):
    '''
    Filters out some tokens before the analysis: stop words, very short
    tokens and plain numbers.
    '''
    valid = lambda (token, _freq): \
        (token not in stopwords) and \
        (len(token) >= 3) and \
        not re.match(NUMBER, token)

    return filter(valid, tokens)


def read_line(line):
import lxml.html
from pylucene import DocField, Index
from mymysql.mymysql import MyMySQL
import random
from utils import progress
import config
import os
from random import Random

#URL_TEMPLATE = "http://www.informatik.uni-trier.de/~ley/db/conf/index-%s.html"
#URL_TEMPLATE = "http://dblp.uni-trier.de/db/hc/conf/index-%s.html"
URL_TEMPLATE = "http://dblp.uni-trier.de/db/%s"

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer')


def download_venues(venue_type):
    '''
    Venue types available: ['conf', 'journals'].
    '''
    folder = config.DATA + ("venues/html/%s/" % venue_type)
    url = URL_TEMPLATE % venue_type

    pos = 1
    while (True):
        print "Processing %d-%d" % (pos, pos + 99)
'''
Created on May 26, 2015

@author: luamct
'''
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.label import MultiLabelBinarizer
from mymysql.mymysql import MyMySQL
from collections import Counter, defaultdict
import numpy as np
import random
import nltk
from utils import PubTexts

db = MyMySQL(db='csx', user='******', passwd='')

#rows = db.select(fields=["id", "title", "abstract"], table="papers")
#pubs = {str(id): (title, abs) for id, title, abs in rows}
#pubs = {str(id): (title + ' ' + abs) for id, title, abs in rows}

MAX_KWS = 10


def get_keywords(min=1):
    kws = db.select(fields=("id", "kw"),
                    table=("papers", "keywords"),
                    join_on=("id", "paper_id"))
    count = Counter([kw for _pid, kw in kws])
'''
@author: hugo
'''
import os
import sys
import time
import numpy as np
import config
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from datasets.mag import get_selected_pubs
from ranking.kddcup_searchers import SimpleSearcher, Searcher
from evaluation.kddcup_expt import get_results_file, save_results

db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)


def rank_affils(selected_affils, conf_name, year, searcher, show=True, results_file=None):
    conf_id = db.select("id", "confs", where="abbr_name='%s'" % conf_name, limit=1)[0]

    start = time.time()
    if searcher.name() == "SimpleSearcher":
'''
Created on Jun 9, 2015

@author: luamct
'''
from collections import defaultdict
from mymysql.mymysql import MyMySQL
from utils import progress, plot
import sys
import config
from pylucene import Index, DocField

IGNORE_TERMS = ["proceedings", "proc."]

db = MyMySQL(db='aminer', user='******', passwd='')


def load_existing_venues():
    rows = db.select(fields=["id", "name"], table="venues")
    return {name: int(id) for id, name in rows}


def save_venue(id, name):
    db.insert(into="venues", fields=["id", "name"], values=[id, name])


def save_citations(id, cits):
    values = [(id, cid) for cid in cits]
    db.insert(into="graph", fields=["citing", "cited"], values=values)