def manual_queries_topic_graphs(from_dataset, to_dataset):
    db = MyMySQL(db=to_dataset)
    pub_ids = set(db.select("id", table="papers"))

    from_folder = config.DATA + "query_sets/" + from_dataset + "/manual/"
    to_folder = config.DATA + "query_sets/" + to_dataset + "/manual/"

    for file_name in os.listdir(from_folder):
        print file_name

        from_file = open(from_folder + file_name, 'r')
        to_file = open(to_folder + file_name, 'w')

        # Read the header line and write it back unchanged
        header = from_file.readline().strip('\n')
        print >> to_file, header

        # Blank out ids that do not exist in the target dataset
        for line in from_file:
            relev, pub_id, title = line.strip().split('\t')
            if pub_id not in pub_ids:
                pub_id = ''

            print >> to_file, "%s\t%s\t%s" % (relev, pub_id, title)

        from_file.close()
        to_file.close()
def __init__(self, n=None):
    db = MyMySQL(db=config.DB_NAME, user=config.DB_USER, passwd=config.DB_PASSWD)

    rows = db.select(fields=["id", "title", "abstract"], table="papers")
    if n:
        rows = random.sample(rows, n)

    self.pubs = {str(id): (title, abs) for id, title, abs in rows}
def check_ids(folder):
    '''
    Checks that every pub_id listed in the query files exists in the papers table.
    '''
    db = MyMySQL(db='csx')
    for i in xrange(1, 8):
        print i
        print

        with open(folder + str(i) + ".txt") as file:
            _header_ = file.readline()
            for line in file:
                relev, pub_id, title = line.strip().split('\t')
                if len(db.select("id", table="papers", where="id='%s'" % pub_id)) == 0:
                    print "Pub not found:", pub_id
def time_diversity(names, query_set):
    # Get the year of each paper so we can compute age statistics of the results
    db = MyMySQL(db=config.DATASET)
    rows = db.select(["id", "year"], table="papers",
                     where="year is not NULL and year between 1950 and 2013")
    years = {pub_id: year for pub_id, year in rows}

    # For each method, report the mean and standard deviation of the
    # publication years of the returned documents
    for name in names:
        file_path = "%s/results/%s/%s/%s.p" % (config.DATA, config.DATASET, query_set, name)

        returned_years = []
        results = cPickle.load(open(file_path, 'r'))
        for _correct, _relevances, returned in results:
            for r in returned:
                if r in years:
                    returned_years.append(years[r])

        print "%s\t%.2f\t%.2f" % (name, np.mean(returned_years), np.std(returned_years))
def fix_contexts_limits():
    """
    Updates the contexts on the graph table so that the tokens at the
    extremities are removed. These are usually parts of words, and are
    therefore meaningless.
    """
    db = MyMySQL(db="csx", user="******", passwd="")

    ctxs = db.select(["citing", "cited", "context"], table="graph", where="context != ''")
    print len(ctxs)

    for citing, cited, ctx in progress(ctxs):
        # Cut at the first and last spaces to drop the partial tokens
        # at both ends of the context
        s = ctx.find(" ")
        e = ctx.rfind(" ")

        db.update(table="graph",
                  set="context='%s'" % ctx[s+1:e],
                  where="(citing='%s') AND (cited='%s')" % (citing, cited))
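# Hedged, stand-alone example (the context string below is made up, not from the
# data): illustrates the trimming rule applied by fix_contexts_limits() above.
# Cutting at the first and last spaces drops the partial tokens at both
# extremities of the citation context.
ctx = "ion channels are regulated by phosphorylati"
s = ctx.find(" ")
e = ctx.rfind(" ")
assert ctx[s+1:e] == "channels are regulated by"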
def keyword_centric(keyword, from_db, to_db):
    db = MyMySQL(db=from_db)
    pub_ids = db.select("paper_id", table="keywords", where="kw='%s'" % keyword)

    # Start from the papers tagged with the keyword and expand the set
    # one citation hop at a time until it reaches the desired size
    nodes = set()
    new_nodes = set()
    new_nodes.update(pub_ids)

    n = 50000
    while len(nodes) < n:
        new_nodes = get_next_hop(new_nodes)
        nodes.update(new_nodes)
        print len(nodes)

    print "Adding %d nodes." % len(nodes)

    new_db = MyMySQL(db=to_db)
    new_db.insert(into="use_papers", fields=["paper_id"], values=list(nodes))
def search(self, query, exclude=[], limit=20, force=False):

    file_path = config.CITERANK_FILE_PATH
    if not os.path.exists(file_path):

        g = nx.DiGraph()
        g.add_edges_from(model.get_all_edges())

        # Remove documents from the exclude list
        g.remove_nodes_from(exclude)

        # Get year of each paper for assembling the personalization array next
        db = MyMySQL(db=config.DATASET)
        rows = db.select(["id", "year"], table="papers")
        years = {}
        for pub_id, year in rows:
            if year is not None:
                years[pub_id] = year

        # Calculate the median to use for missing values
        year_median = np.median(years.values())

        # Create a personalization array by exponentially decaying
        # each paper's factor by its age
        pers = {}
        for node in g.nodes():
            if (node not in years) or (years[node] < 1960) or (years[node] > 2013):
                years[node] = year_median

            pers[node] = np.exp(float(years[node] - 2013) / self.tau)

        print "Running PageRank with %d nodes and age-defined personalization vector." % g.number_of_nodes()
        r = nx.pagerank(g, personalization=pers)

        print "Writing results"
        cPickle.dump(r, open(file_path, "w"))

    # Load cached PageRank values for every node
    r = cPickle.load(open(file_path, "r"))

    # Sort documents decreasingly by PageRank value
    ids, _scores = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))

    # Fetch all documents that have at least one of the terms and
    # store them in a set for fast lookup
    pub_ids = self.index.search(query,
                                search_fields=["title", "abstract"],
                                return_fields=["id"],
                                ignore=exclude)
    pub_ids = set([pid for (pid,) in pub_ids])

    # Return the top-ranked documents that also match the query
    results = []
    for id in ids:
        if id in pub_ids:
            results.append(id)
            if len(results) == limit:
                break

    return results
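# Hedged, self-contained sketch (not part of the class above): shows how the
# age-based personalization weight computed in search() behaves for a few
# publication years. tau = 2.0 is an assumed value, used here for illustration only.
import numpy as np

tau = 2.0
for year in (2013, 2010, 2000):
    # exp((year - 2013) / tau): 1.0 for 2013, ~0.22 for 2010, ~0.0015 for 2000,
    # so recent papers get proportionally more PageRank teleportation mass
    print year, np.exp(float(year - 2013) / tau)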
class Downloader():

    def __init__(self):
        '''
        Stores the process id and creates a task manager to get and update tasks.
        '''
        # Zeno task manager
        self.tasks = zeno.TasksManager("tasks",
                                       host=config.DB_HOST,
                                       user=config.DB_USER,
                                       passwd=config.DB_PASSWD)

        # Database connection
        self.db = MyMySQL(db=config.DB_NAME,
                          host=config.DB_HOST,
                          user=config.DB_USER,
                          passwd=config.DB_PASSWD)

        # Logging configuration
        self.log = utils.config_logging('downloader',
                                        stream=sys.stdout,
                                        level=logging.DEBUG,
                                        format='%(asctime)s (%(name)s) [%(levelname)6s]: %(message)s',
                                        datefmt="%Y-%m-%d %H:%M:%S")

    def parse_error(self, content):
        '''
        Parses the returned response's HTML and throws the appropriate exception.
        '''
        if content.find("Download Limit Exceeded") >= 0:
            raise LimitReachedException()
        else:
            raise Exception()

    def make_csx_url(self, id):
        return "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % id

    def download_from_csx(self, paper_id):
        '''
        Downloads the PDF for the given paper id from CiteSeerX.
        '''
        # Build the CiteSeerX download URL for this paper id
        url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=%s&rep=rep1&type=pdf" % paper_id

        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}
        response = requests.get(url, headers=headers)

        if (response.status_code != 200):
            raise RequestException("%d: %s" % (response.status_code, response.reason))

        if response.headers['Content-Type'].startswith('text/html'):
            self.parse_error(response.content)

        # Save file to the local disk
        file_path = os.path.join(self.data_folder, "%s.pdf" % paper_id)
        pdf_file = open(file_path, "wb")
        pdf_file.write(response.content)
        pdf_file.close()

    def get_all_urls(self, paper_id):
        '''
        Returns all candidate URLs for the paper, including those of the
        alternative papers in the same cluster.
        '''
        cluster_id = self.db.select_one("cluster", table="papers", where="id='%s'" % paper_id)
        alt_paper_ids = self.db.select("id", table="papers", where="cluster=%d" % cluster_id)

        urls = []
        for altern_id in alt_paper_ids:
            urls = urls + [self.make_csx_url(altern_id)]

            other_urls = self.db.select("url", table="urls", where="paperid='%s'" % altern_id)
            urls = other_urls + urls

        return urls

    def download(self, paper_id):
        '''
        Tries each known URL for the paper until one of them returns a valid PDF.
        '''
        headers = {'User-Agent': 'Chrome/34.0.1847.116 (X11; Linux x86_64)'}

        # Get candidate URLs from the database
        urls = self.get_all_urls(paper_id)
        for url in urls:

            # Only supports PDF for now
            if url[-3:].lower() != "pdf":
                continue

            try:
                response = requests.get(url, headers=headers)
            except ConnectionError:
                self.log.warn("Connection error! Ignoring URL '%s'" % (url))
                continue

            response_type = response.headers['Content-Type']

            if response_type.startswith('text/html'):
                if response.content.find("Download Limit Exceeded") >= 0:
                    raise LimitReachedException()
                else:
                    continue

            if (response.status_code != 200) or (response_type != "application/pdf"):
                continue

            # Save file to the local disk
            file_path = config.PDF_PATH % paper_id
            pdf_file = open(file_path, "wb")
            pdf_file.write(response.content)
            pdf_file.close()

            # Download successfully completed
            return True

        # If we got here, no valid URL was found
        return False

    def run(self):
        self.log.info("Starting %s." % os.getpid())

        # Keep running until a stop file is found
        while (not os.path.exists("stop")):

            try:
                paper_id = self.tasks.get_next("START")

                if not self.download(paper_id):
                    raise DownloadException("Could not download paper '%s'." % paper_id)

                # Update the task status and the disk in which the file was saved.
                self.tasks.update_success(paper_id, "DOWNLOADED")

                # Everything went OK if we got here
                self.log.info("%s: OK" % paper_id)

            # Nothing to collect
            except NothingToProcessException:
                self.log.error("Nothing to process.")
                break

            except LimitReachedException:
                self.log.error("Request limit reached!! Waiting...")
                self.tasks.update_release(paper_id, "Request limit reached. Will try again later.")
                time.sleep(60 * 60)

            # URL missing in the DB or not returning the resource
            except DownloadException, e:
                self.log.error("%s: FAIL" % (paper_id))
                self.tasks.update_error(paper_id, message=str(e))

            # For any other exception, log the traceback, update the DB and move on
            except Exception, e:
                self.log.error("%s: FAIL: %s" % (paper_id, traceback.format_exc()))
                self.tasks.update_error(paper_id, message=str(e))
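# Hedged usage sketch (not in the original module): the downloader appears to be
# designed to run as a standalone worker process that polls the task queue until
# a file named "stop" shows up in the working directory. Launching it would
# presumably look like the following; the __main__ guard itself is an assumption.
if __name__ == '__main__':
    Downloader().run()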