def get_stmts(pmids_unread, cleanup=True, sparser_version=None):
    """Run sparser on the pmids in pmids_unread.

    Parameters
    ----------
    pmids_unread : dict
        Maps each pmid to a dict with at least 'content_source' and
        'content_path' keys describing where its content lives.
    cleanup : bool
        Passed through to `read_pmid`; presumably controls removal of
        intermediate files -- confirm against `read_pmid`.
    sparser_version : str or None
        Sparser version string; looked up via `sparser.get_version()`
        when not provided.

    Returns
    -------
    dict
        Maps each successfully-read pmid to its list of statements.
        Pmids for which `read_pmid` returned None are omitted.
    """
    if sparser_version is None:
        sparser_version = sparser.get_version()
    stmts = {}
    now = datetime.now()
    # One log file per worker process so parallel runs don't interleave.
    outbuf_fname = 'sparser_%s_%s.log' % (
        now.strftime('%Y%m%d-%H%M%S'),
        mp.current_process().pid,
    )
    # `with` guarantees the log file is closed even on interrupt,
    # replacing the previous manual try/finally close.
    with open(outbuf_fname, 'wb') as outbuf:
        try:
            for pmid, result in pmids_unread.items():
                logger.info('Reading %s' % pmid)
                source = result['content_source']
                cont_path = result['content_path']
                outbuf.write(('\nReading pmid %s from %s located at %s.\n'
                              % (pmid, source, cont_path)).encode('utf-8'))
                outbuf.flush()
                some_stmts = read_pmid(pmid, source, cont_path,
                                       sparser_version, outbuf, cleanup)
                # Skip pmids that yielded no new statements.
                if some_stmts is not None:
                    stmts[pmid] = some_stmts
        except KeyboardInterrupt as e:
            # Allow a single Ctrl-C to stop reading while still
            # returning (and thus pickling) the partial results.
            logger.exception(e)
            logger.info('Caught keyboard interrupt...stopping. \n'
                        'Results so far will be pickled unless '
                        'Keyboard interupt is hit again.')
    print("Sparser logs may be found in %s" % outbuf_fname)
    return stmts
def run_sparser(pmid_list, tmp_dir, num_cores, start_index, end_index,
                force_read, force_fulltext, cleanup=True, verbose=True):
    """Run the sparser reader on the pmids in pmid_list.

    Parameters
    ----------
    pmid_list : list
        Pmids to read.
    tmp_dir : str
        Scratch directory passed to `get_content_to_read`.
    num_cores : int
        Maximum number of worker processes; capped at len(pmid_list).
    start_index, end_index : int
        Slice bounds passed to `get_content_to_read`.
    force_read, force_fulltext : bool
        Passed through to `get_content_to_read`.
    cleanup : bool
        Forwarded to `get_stmts`.
    verbose : bool
        Unused here; kept for interface compatibility.

    Returns
    -------
    tuple
        (stmts, pmids_unread) where stmts maps pmid -> statement list.
    """
    reader_version = sparser.get_version()
    _, _, _, pmids_read, pmids_unread, _ =\
        get_content_to_read(
            pmid_list, start_index, end_index, tmp_dir, num_cores,
            force_fulltext, force_read, 'sparser', reader_version
        )
    logger.info('Adjusting num cores to length of pmid_list.')
    num_cores = min(len(pmid_list), num_cores)
    logger.info('Adjusted...')
    # BUGFIX: was `num_cores is 1` -- identity comparison with an int
    # literal only works by CPython small-int caching and is a
    # SyntaxWarning on modern Pythons; use equality.
    if num_cores == 1:
        stmts = get_stmts(pmids_unread, cleanup=cleanup)
        stmts.update({
            pmid: get_stmts_from_cache(pmid)[pmid]
            for pmid in pmids_read.keys()
        })
    elif num_cores > 1:
        logger.info("Starting a pool with %d cores." % num_cores)
        pool = mp.Pool(num_cores)
        pmids_to_read = list(pmids_unread.keys())
        N = len(pmids_unread)
        dn = N // num_cores
        logger.info("Breaking pmids into batches.")
        batches = []
        for i in range(num_cores):
            start = i * dn
            # BUGFIX: the last batch must extend to N, otherwise the
            # N % num_cores remainder pmids were silently never read
            # (the old `min((i + 1) * dn, N)` bound could never exceed
            # num_cores * dn <= N, so it never captured the tail).
            stop = N if i == num_cores - 1 else (i + 1) * dn
            batches.append({
                k: pmids_unread[k]
                for k in pmids_to_read[start:stop]
            })
        get_stmts_func = functools.partial(get_stmts, cleanup=cleanup,
                                           sparser_version=reader_version)
        logger.info("Mapping get_stmts onto pool.")
        unread_res = pool.map(get_stmts_func, batches)
        logger.info('len(unread_res)=%d' % len(unread_res))
        read_res = pool.map(get_stmts_from_cache, pmids_read.keys())
        logger.info('len(read_res)=%d' % len(read_res))
        pool.close()
        logger.info('Multiprocessing pool closed.')
        pool.join()
        logger.info('Multiprocessing pool joined.')
        # Merge the per-batch dicts into one pmid -> statements map.
        stmts = {
            pmid: stmt_list
            for res_dict in unread_res + read_res
            for pmid, stmt_list in res_dict.items()
        }
    else:
        # Guard: with an empty pmid_list num_cores becomes 0; previously
        # `stmts` was unbound here and the return raised NameError.
        stmts = {}
    logger.info('len(stmts)=%d' % len(stmts))
    return (stmts, pmids_unread)