def main():
    """Compress the PDF named by the command line in place.

    Reads ``args.file``, runs it through :func:`pdf_compress`, and
    replaces the original only when the compressed output is strictly
    smaller.  Side effects: may remove and recreate ``args.file`` and
    temporarily creates ``args.file + '.compressed'``.
    """
    global args
    args = get_args()
    # Binary mode on both ends: a PDF is binary data, and text-mode
    # I/O (the original used the default mode and 'w') corrupts it on
    # platforms that translate line endings.
    with open(args.file, 'rb') as fin:
        data = fin.read()
    newdata = pdf_compress(data)
    if len(newdata) < len(data):
        newfilename = args.file + '.compressed'
        with open(newfilename, 'wb') as fout:
            fout.write(newdata)
        # Remove first so the rename succeeds on platforms where
        # os.rename refuses to overwrite an existing file.
        os.remove(args.file)
        os.rename(newfilename, args.file)
def main():
    """Find and download a paper, then print any metadata gathered.

    ``args.title`` is either a paper title (searched via every
    registered searcher, in parallel) or a direct http(s) URL (search
    skipped).  Candidate (parser, result) pairs are tried in
    descending parser priority; the first successful download is
    compressed and written to ``args.directory`` (or renamed to
    ``args.output`` when given).
    """
    global args
    args = get_args()
    query = args.title
    directory = args.directory
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    download_candidates = []

    if re.match('^http[s]?://', query):
        # Direct URL given: skip search, ask each parser directly.
        ctx = JobContext("")
        sr = SearchResult(None, query)
        for parser in parsers:
            if parser.can_handle(sr):
                parser.fetch_info(ctx, sr)  # will update title
                download_candidates.append((parser, sr))
    else:
        ctx = JobContext(query)
        # Fan the same context out to every searcher concurrently.
        search_args = zip(searchers, [ctx] * len(searchers))
        pool = Pool()
        as_results = [pool.apply_async(searcher_run, arg)
                      for arg in search_args]
        for s in as_results:
            s = s.get()
            if s is None:
                continue
            # BUG FIX: dropped the stray `print s['ctx_update']` debug
            # leftover that dumped internal state to stdout.
            ctx.update_meta_dict(s['ctx_update'])
            ctx.try_update_title_from_search_result(s)
            for sr in s['results']:
                for parser in parsers:
                    if parser.can_handle(sr):
                        parser.fetch_info(ctx, sr)  # will update title
                        download_candidates.append((parser, sr))
        pool.terminate()

    # Try the highest-priority parsers first.
    download_candidates = sorted(
        download_candidates, key=lambda x: x[0].priority, reverse=True)
    for (parser, sr) in download_candidates:
        data = parser.download(sr)
        if not data:
            continue
        data = pdf_compress(data)
        if ctx.title:
            ctx.title = finalize_filename(ctx.title)
        else:
            log_info("Failed to guess paper title!")
            ctx.title = "Unnamed Paper {}".format(md5(data))
        filename = os.path.join(directory, ctx.title + ".pdf")
        if os.path.exists(filename):
            log_err("File \"{}\" exists! overwrite? (y/n)".format(
                os.path.basename(filename)))
            resp = raw_input()
            if resp not in ['y', 'Y']:
                log_info("No file written. \nExiting...")
                break
        with open(filename, 'wb') as f:
            f.write(data)
        if args.output:
            os.rename(filename, args.output)
            # BUG FIX: report the path the file actually ends up at,
            # not its pre-rename location.
            filename = args.output
        log_info("Successfully downloaded to {0}".format(filename))
        break
    else:
        # for/else: reached only when no candidate download succeeded.
        log_err("Failed to download {0}".format(ctx.title))

    if ctx.meta.get('bibtex'):
        log_info("Bibtex:\n{}".format(ctx.meta['bibtex']))
    if ctx.meta.get('author'):
        log_info("Author: {0}".format(ctx.meta['author']))
    if ctx.meta.get('citecnt'):
        log_info("Cite count: {0}".format(ctx.meta['citecnt']))
def main():
    """Find and download a paper, then print any metadata gathered.

    ``args.title`` is either a paper title (searched via every
    registered searcher, in parallel) or a direct http(s) URL (search
    skipped).  Candidate (parser, result) pairs are tried in
    descending parser priority; the first successful download is
    compressed and written to ``args.directory`` (or renamed to
    ``args.output`` when given).
    """
    global args
    args = get_args()
    query = args.title
    directory = args.directory
    searchers = searcher.register_searcher.get_searcher_list()
    parsers = fetcher.register_parser.get_parser_list()
    download_candidates = []

    if re.match('^http[s]?://', query):
        # Direct URL given: skip search, ask each parser directly.
        ctx = JobContext("")
        sr = SearchResult(None, query)
        for parser in parsers:
            if parser.can_handle(sr):
                parser.fetch_info(ctx, sr)  # will update title
                download_candidates.append((parser, sr))
    else:
        ctx = JobContext(query)
        # Fan the same context out to every searcher concurrently.
        search_args = zip(searchers, [ctx] * len(searchers))
        pool = Pool()
        as_results = [pool.apply_async(searcher_run, arg)
                      for arg in search_args]
        for s in as_results:
            s = s.get()
            if s is None:
                continue
            # BUG FIX: dropped the stray `print s['ctx_update']` debug
            # leftover that dumped internal state to stdout.
            ctx.update_meta_dict(s['ctx_update'])
            ctx.try_update_title_from_search_result(s)
            for sr in s['results']:
                for parser in parsers:
                    if parser.can_handle(sr):
                        parser.fetch_info(ctx, sr)  # will update title
                        download_candidates.append((parser, sr))
        pool.terminate()

    # Try the highest-priority parsers first.
    download_candidates = sorted(
        download_candidates, key=lambda x: x[0].priority, reverse=True)
    for (parser, sr) in download_candidates:
        data = parser.download(sr)
        if not data:
            continue
        data = pdf_compress(data)
        if ctx.title:
            ctx.title = finalize_filename(ctx.title)
        else:
            log_info("Failed to guess paper title!")
            ctx.title = "Unnamed Paper {}".format(md5(data))
        filename = os.path.join(directory, ctx.title + ".pdf")
        if os.path.exists(filename):
            log_err("File \"{}\" exists! overwrite? (y/n)".format(
                os.path.basename(filename)))
            resp = raw_input()
            if resp not in ['y', 'Y']:
                log_info("No file written. \nExiting...")
                break
        with open(filename, 'wb') as f:
            f.write(data)
        if args.output:
            os.rename(filename, args.output)
            # BUG FIX: report the path the file actually ends up at,
            # not its pre-rename location.
            filename = args.output
        log_info("Successfully downloaded to {0}".format(filename))
        break
    else:
        # for/else: reached only when no candidate download succeeded.
        log_err("Failed to download {0}".format(ctx.title))

    if ctx.meta.get('bibtex'):
        log_info("Bibtex:\n{}".format(ctx.meta['bibtex']))
    if ctx.meta.get('author'):
        log_info("Author: {0}".format(ctx.meta['author']))
    if ctx.meta.get('citecnt'):
        log_info("Cite count: {0}".format(ctx.meta['citecnt']))