def main():
    global args
    args = get_args()
    query = args.title
    directory = args.directory

    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    download_candidates = []

    if re.match('^http[s]?://', query):
        # skip search
        ctx = JobContext("")
        sr = SearchResult(None, query)
        for parser in parsers:
            if parser.can_handle(sr):
                parser.fetch_info(ctx, sr)  # will update title
                download_candidates.append((parser, sr))
    else:
        #query = "Distinctive image features from scale-invariant keypoint"
        ctx = JobContext(query)

        search_args = zip(searchers, [ctx] * len(searchers))
        pool = Pool()
        as_results = [pool.apply_async(searcher_run, arg)
                      for arg in search_args]
        #results = [searcher_run(*arg) for arg in search_args]  # for debug
        for s in as_results:
            s = s.get()
            if s is None:
                continue
            ctx.update_meta_dict(s['ctx_update'])
            print s['ctx_update']
            ctx.try_update_title_from_search_result(s)

            for sr in s['results']:
                for parser in parsers:
                    if parser.can_handle(sr):
                        parser.fetch_info(ctx, sr)  # will update title
                        download_candidates.append((parser, sr))
        pool.terminate()

    download_candidates = sorted(
        download_candidates, key=lambda x: x[0].priority, reverse=True)

    for (parser, sr) in download_candidates:
        data = parser.download(sr)
        if not data:
            continue
        data = pdf_compress(data)

        if ctx.title:
            ctx.title = finalize_filename(ctx.title)
        else:
            log_info("Failed to guess paper title!")
            ctx.title = "Unnamed Paper {}".format(md5(data))
        filename = os.path.join(directory, ctx.title + ".pdf")

        if os.path.exists(filename):
            log_err("File \"{}\" exists! overwrite? (y/n)".format(
                os.path.basename(filename)))
            resp = raw_input()
            if resp not in ['y', 'Y']:
                log_info("No file written. Exiting...")
                break
        with open(filename, 'wb') as f:
            f.write(data)
        if args.output:
            os.rename(filename, args.output)
        log_info("Successfully downloaded to {0}".format(filename))
        break
    else:
        log_err("Failed to download {0}".format(ctx.title))

    if ctx.meta.get('bibtex'):
        log_info("Bibtex:\n{}".format(ctx.meta['bibtex']))
    if ctx.meta.get('author'):
        log_info("Author: {0}".format(ctx.meta['author']))
    if ctx.meta.get('citecnt'):
        log_info("Cite count: {0}".format(ctx.meta['citecnt']))
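# --- reference sketch (not part of the repo) --------------------------------
# main() above calls get_args(), which is defined elsewhere. Based only on the
# attributes main() reads (args.title, args.directory, args.output), it is
# assumed to be a thin argparse wrapper along these lines; the real flag names
# and defaults may differ:
import argparse

def get_args():
    parser = argparse.ArgumentParser(
        description='Search for a paper by title (or fetch a direct URL) '
                    'and download its pdf')
    parser.add_argument('title',
                        help='paper title, or a direct http(s) URL')
    parser.add_argument('-d', '--directory', default='.',
                        help='directory to save the pdf into')
    parser.add_argument('-o', '--output', default=None,
                        help='rename the downloaded file to this path')
    return parser.parse_args()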
import sys
from multiprocessing import Pool

import searcher
from job import JobContext
from searcher import searcher_run

if __name__ == '__main__':
    query = sys.argv[1]
    searchers = searcher.register_searcher.get_searcher_list()
    searchers = searchers[1:]   # skip the first registered searcher
    print [k.name for k in searchers]
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # search and gather every result item
    for s in async_results:
        s = s.get()
        if s is None:
            continue
        srs = s['results']
        print srs

        meta = s.get('ctx_update')
        if meta:
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()
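# Usage of the test script above (the filename test-searcher.py is assumed):
#   $ python2 test-searcher.py "Distinctive image features from scale-invariant keypoints"
# Each searcher runs in its own worker process via pool.apply_async; a worker
# returns None on failure, so the loop simply skips searchers that found nothing.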
def handle_title_query(query):
    query = title_beautify(query)
    log_info("Get title query: {0}".format(query))

    # prefix search in db (query terms there have large idf)
    res = search_startswith(query)
    if res:
        log_info("Found {0} results in db: {1}".format(
            len(res), str([x['_id'] for x in res])))
        return res

    # similar search
    res = similar_search(query)
    if res:
        log_info(u"Found similar results in db: {0}".format(res['_id']))
        return [res]

    # search on web
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    ctx = JobContext(query)

    args = zip(searchers, [ctx] * len(searchers))
    pool = Pool()
    async_results = [pool.apply_async(searcher_run, arg) for arg in args]

    # search and gather every result item
    all_search_results = []
    for s in async_results:
        s = s.get(ukconfig.PYTHON_POOL_TIMEOUT)
        if s is None:
            continue
        srs = s['results']

        # a searcher may have found a better title; retry the db with it
        try:
            updated_title = s['ctx_update']['title']
        except KeyError:
            pass
        else:
            if updated_title != query:
                query = updated_title
                res = search_exact(query)
                if res:
                    log_info("Found {0} results in db: {1}".format(
                        len(res), str([x['_id'] for x in res])))
                    return res
        all_search_results.extend(srs)

        meta = s.get('ctx_update')
        if meta:
            log_info('Meta update from searcher: {0}'.format(str(meta.keys())))
            ctx.update_meta_dict(meta)
    pool.close()
    pool.terminate()

    # analyse each result and try to parse info
    download_candidates = []
    parser_used = set()
    found = False
    for sr in all_search_results:
        for parser in parsers:
            if parser.can_handle(sr):
                download_candidates.append((parser, sr))
                if ctx.need_field(parser.support_meta_field):
                    # skip a non-repeatable fetcher that was already tried
                    if not parser.repeatable and \
                            parser.name in parser_used:
                        continue
                    else:
                        parser_used.add(parser.name)

                    succ = parser.fetch_info(ctx, sr)
                    if not succ:
                        continue
                    found = True
                    if ctx.existing is not None:
                        log_info("Found {0} results in db".format(
                            len(ctx.existing)))
                        return [ctx.existing]

    # no metadata or downloadable source found
    if not found and len(download_candidates) == 0:
        return None

    # save metadata, return it, and start downloading in the background
    try:
        pid = new_paper(ctx)
        ret = [{'_id': pid,
                'title': ctx.title,
                'view_cnt': 1,
                'download_cnt': 0}]
        ret[0].update(ctx.meta)
        progress_dict[pid] = 0.0
        if len(download_candidates) > 0:
            thread = Thread(target=start_download,
                            args=(download_candidates, ctx, pid))
            thread.start()
        return ret
    except Exception:
        log_exc("Failed to save to db")
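# The contract between searcher_run and its consumers, inferred from the call
# sites above rather than from an authoritative spec: each worker returns None
# on failure, or a dict shaped roughly like
#
#   {'ctx_update': {'title': ...,      # corrected title, when available
#                   'bibtex': ...,     # plus other optional metadata fields
#                   ...},
#    'results': [SearchResult, ...]}   # candidate sources for the parsers
#
# handle_title_query() merges 'ctx_update' into ctx via update_meta_dict(),
# retries the database when the title changed, and hands every SearchResult
# to each parser whose can_handle(sr) accepts it.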