ctx = JobContext("Test Filename") parser = register_parser.parser_dict['arxiv.org'] sr = SearchResult(None, "http://arxiv.org/abs/1312.6680") #sr = SearchResult(None, " http://arxiv.org/abs/1404.3610") #parser = register_parser.parser_dict['dl.acm.org'] #url = "http://dl.acm.org/citation.cfm?id=1859761" # twitter #url = "http://dl.acm.org/citation.cfm?id=996342" # SIFT # Large Number of cited #url = "http://dl.acm.org/citation.cfm?id=2366157" # big #url = "http://dl.acm.org/citation.cfm?id=1656278" # Weka #sr = SearchResult(None, url) #parser = register_parser.parser_dict['ieeexplore.ieee.org'] #sr = SearchResult(None, "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=00726791") #sr = SearchResult(None, "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4244529") #parser = register_parser.parser_dict['sciencedirect.com'] #url = "http://www.sciencedirect.com/science/article/pii/S1570870513000073" #sr = SearchResult(None, url) #params = parser.fetch_info(ctx, sr) #print params data = parser.download(sr) print ctx.title if ukconfig.USE_DB and ctx.success: pid = new_paper(ctx)
# handle_title_query: answer a title query from the local db if possible,
# otherwise fan out to the registered web searchers and parsers.  Stdlib
# imports are added here; title_beautify, the search_* helpers, JobContext,
# searcher, fetcher, ukconfig, new_paper, progress_dict, start_download and
# the log_* functions are assumed to come from the surrounding package.
from multiprocessing import Pool
from threading import Thread

def handle_title_query(query):
    query = title_beautify(query)
    log_info("Got title query: {0}".format(query))

    # starts-with search in the database (and the idf is large)
    res = search_startswith(query)
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res

    # similarity search in the database
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # fall back to searching the web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # collect the result items from every searcher
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # if a searcher updated the title, retry the database with it
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # skip a non-repeatable fetcher that was already tried
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    else:
                        parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(
                            len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None

    # save the paper, return its record, and start downloading in background
    try:
        pid = new_paper(ctx)
        ret = [{'_id': pid,
                'title': ctx.title,
                'view_cnt': 1,
                'download_cnt': 0}]
        ret[0].update(ctx.meta)

        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
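# Usage sketch (not from the original module): a minimal, hypothetical
# driver for handle_title_query.  The paper title is made up, and the
# printed fields assume the record layout built above ('_id', 'title',
# 'view_cnt', 'download_cnt').
def demo_title_query():
    results = handle_title_query("Distilling the Knowledge in a Neural Network")
    if not results:
        print "no metadata or downloadable source found"
        return
    for r in results:
        print "{0}: {1}".format(r['_id'], r['title'])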