Example #1
    def _run_reading(self, db, trids, max_refs=5000):
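        """Read the given text refs locally, then produce statements from the readings."""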
        if len(trids) > max_refs:
            raise ReadingUpdateError("Too many IDs to run locally. Try "
                                     "running on batch (use_batch).")
        logger.info("Producing readings locally for %d new text refs." %
                    len(trids))
        base_dir = path.join(THIS_DIR, 'read_all_%s' % self.reader.name)
        reader_inst = self.reader(base_dir=base_dir, n_proc=self.n_proc)

        logger.info("Making readings...")
        outputs = rdb.produce_readings({'trid': trids}, [reader_inst],
                                       read_mode='unread_unread',
                                       db=db,
                                       prioritize=True,
                                       verbose=self.verbose)
        logger.info("Made %d readings." % len(outputs))
        logger.info("Making statements...")
        rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
        return
Example #2
def test_reading_content_insert():
    "Test the primary content throughput of make_db_readings."
    db = get_db_with_pubmed_content()

    print("Test reading")
    tc_list = db.select_all(db.TextContent)
    readers = get_readers()
    reading_output = []
    for reader in readers:
        reading_output += reader.read([process_content(tc) for tc in tc_list],
                                      verbose=True)
    expected_output_len = len(tc_list)*len(readers)
    assert len(reading_output) == expected_output_len, \
        ("Not all text content successfully read."
         "Expected %d outputs, but got %d.") % (expected_output_len,
                                                len(reading_output))

    print("Test reading insert")
    rdb.upload_readings(reading_output, db=db)
    r_list = db.select_all(db.Readings)

    # True only if every reading output matches some reading row in the database.
    def is_complete_match(r_list, reading_output):
        return all([any([rd.matches(r) for r in r_list])
                    for rd in reading_output])

    assert is_complete_match(r_list, reading_output), \
        "Not all reading output posted."
    rdb.upload_readings(reading_output, db=db)
    assert is_complete_match(r_list, reading_output), \
        "Uniqueness constraints failed."

    print("Test enrichment")
    assert all([rd.reading_id is None for rd in reading_output]), \
        "No readings should have reading_ids already."
    rdb._enrich_reading_data(reading_output, db=db)
    assert all([rd.reading_id is not None for rd in reading_output]), \
        "Some reading data objects didn't have reading_ids after enrichment."

    print("Test making statements")
    stmts = rdb.produce_statements(reading_output, db=db)
    assert len(stmts), 'No statements created.'
    db_stmts = db.select_all(db.Statements)
    assert len(db_stmts) == len(stmts), \
        "Only %d/%d statements added." % (len(db_stmts), len(stmts))
    assert len(db.select_all(db.Agents)), "No agents added."
Example #3
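    # Decode the raw ID list (e.g. an S3 object body), take this job's slice,
    # and shuffle it before building the ID dict.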
    id_list_str = id_list_obj['Body'].read().decode('utf8').strip()
    id_str_list = id_list_str.splitlines()[args.start_index:args.end_index]
    random.shuffle(id_str_list)
    id_dict = get_id_dict([line.strip() for line in id_str_list])

    # Read everything ========================================
    outputs = produce_readings(id_dict, readers, verbose=True,
                               read_mode=args.mode,
                               force_fulltext=args.force_fulltext,
                               prioritize=(not args.read_all_fulltext))

    # Preserve the sparser logs
    contents = os.listdir('.')
    sparser_logs = [fname for fname in contents
                    if fname.startswith('sparser') and fname.endswith('log')]
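    # Build a timestamped S3 key prefix under which to save this run's sparser logs.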
    sparser_log_dir = ('reading_results/%s/logs/run_db_reading_queue/'
                       'sparser_logs_%s/') % (
                           args.basename,
                           datetime.now().strftime('%Y%m%d_%H%M%S')
                           )
    for fname in sparser_logs:
        s3_key = sparser_log_dir + fname
        logger.info("Saving sparser logs to %s on s3 in %s."
                    % (s3_key, bucket_name))
        with open(fname, 'r') as f:
            client.put_object(Key=s3_key, Body=f.read(),
                              Bucket=bucket_name)

    # Convert the outputs to statements ==================================
    produce_statements(outputs, n_proc=args.num_cores)
Example #4
                               read_mode=args.read_mode,
                               get_preexisting=(args.stmt_mode == 'all'),
                               force_fulltext=args.force_fulltext,
                               prioritize=args.use_best_fulltext, db=db)
    ends['reading'] = datetime.now()

    # Preserve the sparser logs
    contents = os.listdir('.')
    sparser_logs = [fname for fname in contents
                    if fname.startswith('sparser') and fname.endswith('log')]
    sparser_log_dir = s3_log_prefix + 'sparser_logs/'
    for fname in sparser_logs:
        s3_key = sparser_log_dir + fname
        logger.info("Saving sparser logs to %s on s3 in %s."
                    % (s3_key, bucket_name))
        with open(fname, 'r') as f:
            client.put_object(Key=s3_key, Body=f.read(),
                              Bucket=bucket_name)

    # Convert the outputs to statements ==================================
    if args.stmt_mode != 'none':
        starts['statement production'] = datetime.now()
        stmt_data = produce_statements(outputs, n_proc=args.num_cores, db=db)
        ends['statement production'] = datetime.now()
    else:
        stmt_data = []

    report_statistics(outputs, stmt_data, starts, ends, s3_log_prefix, client,
                      bucket_name)