Example #1
import logging
import multiprocessing as mp
from datetime import datetime

# Assumed to be provided by the surrounding module: `sparser` (exposing
# get_version()) and the helper `read_pmid`; they are not defined in this
# snippet.

logger = logging.getLogger(__name__)

def get_stmts(pmids_unread, cleanup=True, sparser_version=None):
    "Run sparser on the pmids in pmids_unread."
    if sparser_version is None:
        sparser_version = sparser.get_version()
    stmts = {}
    now = datetime.now()
    outbuf_fname = 'sparser_%s_%s.log' % (
        now.strftime('%Y%m%d-%H%M%S'),
        mp.current_process().pid,
    )
    outbuf = open(outbuf_fname, 'wb')
    try:
        for pmid, result in pmids_unread.items():
            logger.info('Reading %s' % pmid)
            source = result['content_source']
            cont_path = result['content_path']
            outbuf.write(('\nReading pmid %s from %s located at %s.\n' %
                          (pmid, source, cont_path)).encode('utf-8'))
            outbuf.flush()
            some_stmts = read_pmid(pmid, source, cont_path, sparser_version,
                                   outbuf, cleanup)
            if some_stmts is not None:
                stmts[pmid] = some_stmts
            # Otherwise Sparser produced no new statements for this pmid.
    except KeyboardInterrupt as e:
        logger.exception(e)
        logger.info('Caught keyboard interrupt...stopping.\n'
                    'Results so far will be pickled unless the '
                    'keyboard interrupt is hit again.')
    finally:
        outbuf.close()
        print("Sparser logs may be found in %s" % outbuf_fname)
    return stmts
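
As a usage sketch: get_stmts expects a dict mapping each pmid to a record with 'content_source' and 'content_path' keys, which read_pmid consumes above. The pmid, source label, and file path below are hypothetical placeholders, not real data.

# Minimal, hypothetical invocation of get_stmts.
pmids_unread = {
    '12345678': {
        'content_source': 'pmc_oa_xml',         # hypothetical source label
        'content_path': '/tmp/12345678.nxml',   # hypothetical content file
    },
}
stmts = get_stmts(pmids_unread, cleanup=False)
for pmid, stmt_list in stmts.items():
    print('%s: %d statements' % (pmid, len(stmt_list)))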
Example #2
import functools
import logging
import multiprocessing as mp

# Assumed from the surrounding module: `sparser`, `get_stmts_from_cache`,
# and `get_content_to_read`; `get_stmts` is the function from Example #1.

logger = logging.getLogger(__name__)

def run_sparser(pmid_list,
                tmp_dir,
                num_cores,
                start_index,
                end_index,
                force_read,
                force_fulltext,
                cleanup=True,
                verbose=True):
    """Run the sparser reader on the pmids in pmid_list."""
    reader_version = sparser.get_version()
    _, _, _, pmids_read, pmids_unread, _ = get_content_to_read(
        pmid_list, start_index, end_index, tmp_dir, num_cores,
        force_fulltext, force_read, 'sparser', reader_version
        )

    num_cores = min(len(pmid_list), num_cores)
    logger.info('Adjusted num_cores to %d (no more than the number of pmids).'
                % num_cores)
    if num_cores <= 1:
        # Serial path; also handles an empty pmid_list (num_cores == 0).
        stmts = get_stmts(pmids_unread, cleanup=cleanup)
        stmts.update({
            pmid: get_stmts_from_cache(pmid)[pmid]
            for pmid in pmids_read.keys()
        })
    else:
        logger.info("Starting a pool with %d cores." % num_cores)
        pool = mp.Pool(num_cores)
        pmids_to_read = list(pmids_unread.keys())
        N = len(pmids_unread)
        # Ceiling division, so the remainder pmids land in the last batch
        # when N is not an exact multiple of num_cores.
        dn = (N + num_cores - 1) // num_cores
        logger.info("Breaking pmids into batches.")
        batches = []
        for i in range(num_cores):
            batches.append({
                k: pmids_unread[k]
                for k in pmids_to_read[i * dn:min((i + 1) * dn, N)]
            })
        get_stmts_func = functools.partial(get_stmts,
                                           cleanup=cleanup,
                                           sparser_version=reader_version)
        logger.info("Mapping get_stmts onto pool.")
        unread_res = pool.map(get_stmts_func, batches)
        logger.info('len(unread_res)=%d' % len(unread_res))
        read_res = pool.map(get_stmts_from_cache, pmids_read.keys())
        logger.info('len(read_res)=%d' % len(read_res))
        pool.close()
        logger.info('Multiprocessing pool closed.')
        pool.join()
        logger.info('Multiprocessing pool joined.')
        stmts = {
            pmid: stmt_list
            for res_dict in unread_res + read_res
            for pmid, stmt_list in res_dict.items()
        }
        logger.info('len(stmts)=%d' % len(stmts))

    return (stmts, pmids_unread)
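
And a sketch of driving run_sparser end to end, assuming the helpers above are importable and Sparser itself is installed; the pmids and scratch directory here are hypothetical.

pmid_list = ['12345678', '23456789', '34567890']  # hypothetical pmids
stmts, pmids_unread = run_sparser(
    pmid_list,
    tmp_dir='/tmp/sparser_reading',  # hypothetical scratch directory
    num_cores=2,
    start_index=0,
    end_index=len(pmid_list),
    force_read=False,
    force_fulltext=False,
    )
logger.info('Got statements for %d pmids (%d were previously unread).'
            % (len(stmts), len(pmids_unread)))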