Esempio n. 1
0
def _get_loaded_db(num_stmts,
                   batch_size=None,
                   split=None,
                   with_init_corpus=False):
    """Create a test database and fill it with raw statements and agents.

    Parameters
    ----------
    num_stmts
        Forwarded to `_get_input_stmt_tuples` to obtain the statement tuples.
    batch_size
        Forwarded to `pm.PreassemblyManager` when `split` is given. Must not
        be None in that case.
    split : float or None
        If None, every statement tuple is copied in a single round. Otherwise
        `int(split * <number of tuples>)` randomly sampled tuples are copied
        first with a `create_date` two days in the past, and the remaining
        tuples are copied afterwards with the current time as `create_date`.
    with_init_corpus : bool
        If True, create a preassembled corpus right after the first (or only)
        round of statements and agents has been loaded.

    Returns
    -------
    The database object returned by `_get_background_loaded_db`, now loaded.
    """
    print("Creating and filling a test database:")
    db = _get_background_loaded_db()

    # Much of the processing below exists because the code is still in active
    # development. TODO: Format pickle to match.
    stmt_rows, col_names = _get_input_stmt_tuples(num_stmts)

    def _load_agents():
        # Agents are (re)inserted after every batch of raw statements.
        print("\tAdding agents...")
        db_util.insert_agents(db, 'raw')

    def _copy_dated(rows, stamp):
        # Append an explicit creation date to every row before copying.
        db.copy('raw_statements',
                [row + (stamp, ) for row in rows],
                col_names + ('create_date', ))

    print("\tInserting the raw statements...")

    # Simple case: everything goes in at once.
    if split is None:
        db.copy('raw_statements', stmt_rows, col_names)
        _load_agents()
        if with_init_corpus:
            print("\tAdding a preassembled corpus...")
            pm.PreassemblyManager().create_corpus(db)
        return db

    # Split case: an "old" first batch followed by a "new" second batch.
    assert batch_size is not None
    first_count = int(split * len(stmt_rows))
    first_rows = random.sample(stmt_rows, first_count)
    later_rows = list(set(stmt_rows) - set(first_rows))

    _copy_dated(first_rows, datetime.now() - timedelta(days=2))
    _load_agents()
    if with_init_corpus:
        print("\tAdding a preassembled corpus from first batch of raw "
              "stmts...")
        pm.PreassemblyManager(batch_size=batch_size).create_corpus(db)

    print("\tInserting the rest of the raw statements...")
    _copy_dated(later_rows, datetime.now())
    _load_agents()
    return db
Esempio n. 2
0
    def add_statements(self, fraction=1, with_pa=False):
        """Add statements and agents to the database.

        Parameters
        ----------
        fraction : float between 0 and 1
            Default is 1. The fraction of remaining statements to be added.
            The number of statements sampled is rounded down to an integer.
        with_pa : bool
            Default False. Choose to run pre-assembly/incremental-preassembly
            on the added statements.
        """
        available_tuples = self.get_available_stmt_tuples()

        # Compare by value, not identity: `fraction is not 1` would send 1.0
        # down the sampling path and depends on interpreter int caching.
        if fraction != 1:
            # random.sample needs an integer sample size; the raw product of
            # a float fraction and a length is a float.
            num_stmts = int(fraction * len(available_tuples))
            input_tuples = random.sample(available_tuples, num_stmts)
        else:
            input_tuples = available_tuples

        print("Loading %d statements..." % len(input_tuples))
        if hasattr(self.test_db.RawStatements, 'id'):
            self.test_db.copy('raw_statements', input_tuples,
                              self.test_data['raw_statements']['cols'])
        else:
            # The table has no `id` attribute: drop the first entry of every
            # tuple and the first column name (presumably the id column).
            self.test_db.copy('raw_statements', [t[1:] for t in input_tuples],
                              self.test_data['raw_statements']['cols'][1:])

        print("Inserting agents...")
        db_util.insert_agents(self.test_db, 'raw')

        if with_pa:
            print("Preassembling new statements...")
            if len(input_tuples) > 100:
                # For larger loads, pass a batch size of a tenth of the input.
                batch_size = len(input_tuples)//10
                pam = pm.PreassemblyManager(1, batch_size)
            else:
                pam = pm.PreassemblyManager()

            # Supplement when statements were already used; otherwise start
            # a new corpus.
            if self.used_stmt_tuples:
                pam.supplement_corpus(self.test_db)
            else:
                pam.create_corpus(self.test_db)
Esempio n. 3
0
def _check_preassembly_with_database(num_stmts, batch_size, n_proc=1):
    """Create a preassembled corpus in a test database and validate it.

    Parameters
    ----------
    num_stmts
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size
        Forwarded to `pm.PreassemblyManager`.
    n_proc
        Forwarded to `pm.PreassemblyManager`. Default is 1.

    Raises
    ------
    AssertionError
        If any of the consistency checks on the resulting tables fails.
    """
    db = _get_loaded_db(num_stmts)

    # Now test the set of preassembled (pa) statements from the database against
    # what we get from old-fashioned preassembly (opa).
    opa_inp_stmts = _get_opa_input_stmts(db)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_ids = {raw_stmt.id for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                       print_logs=True)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end-start)

    # Make sure the number of pa statements is within reasonable bounds.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the evidence links.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_ids = {ru.raw_stmt_id for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Every linked raw statement id must exist among the raw statements.
    # (Compared by value; `len(...) is 0` relied on int object identity.)
    assert all_link_ids <= all_raw_ids
    # Every pa statement must be referenced by at least one link.
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # Check the support links.
    sup_links = db.select_all([db.PASupportLinks.supporting_mk_hash,
                               db.PASupportLinks.supported_mk_hash])
    assert sup_links
    assert not any(link[0] == link[1] for link in sup_links),\
        "Found self-support in the database."

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Map each statement hash to whether that hash also appears among the
    # statements it supports or is supported by.
    self_supports = {
        shash(s): shash(s) in {shash(s_) for s_ in s.supported_by + s.supports}
        for s in pa_stmts
        }
    assert not any(self_supports.values()),\
        "Found self-support in constructed pa statement objects."

    _check_against_opa_stmts(db, opa_inp_stmts, pa_stmts)
Esempio n. 4
0
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8):
    """Run an incremental preassembly update and compare it to the reference.

    Parameters
    ----------
    num_stmts
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size
        Forwarded both to `_get_loaded_db` (for the initial corpus) and to
        the `pm.PreassemblyManager` that performs the supplement.
    split : float
        Forwarded to `_get_loaded_db`. Default is 0.8.
    """
    # `_get_loaded_db` asserts that `batch_size` is given whenever `split` is
    # not None, so it has to be passed through here.
    db = _get_loaded_db(num_stmts, batch_size=batch_size, split=split,
                        with_init_corpus=True)

    # Time the incremental update on its own.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    print("Beginning supplement...")
    pa_manager.supplement_corpus(db)
    end = datetime.now()
    print("Duration of incremental update:", end-start)

    # Fetch both the raw and the preassembled statements and compare them.
    raw_stmts = db_client.get_statements([], preassembled=False, db=db)
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    _check_against_opa_stmts(db, raw_stmts, pa_stmts)
Esempio n. 5
0
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8, n_proc=1):
    """Supplement a preassembled corpus and compare it to the reference.

    Parameters
    ----------
    num_stmts
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size
        Forwarded to `pm.PreassemblyManager`.
    split : float
        Forwarded to `_get_loaded_db`. Default is 0.8.
    n_proc
        Forwarded to `pm.PreassemblyManager`. Default is 1.
    """
    # The same manager is handed to the loader and then used to supplement.
    manager = pm.PreassemblyManager(
        batch_size=batch_size,
        n_proc=n_proc,
        print_logs=True,
    )
    db = _get_loaded_db(num_stmts, split=split, pam=manager)
    reference_inputs = _get_opa_input_stmts(db)

    # Time only the supplement step.
    began = datetime.now()
    print("Beginning supplement...")
    manager.supplement_corpus(db)
    finished = datetime.now()
    print("Duration of incremental update:", finished - began)

    assembled = db_client.get_statements(
        [],
        preassembled=True,
        db=db,
        with_support=True,
    )
    _check_against_opa_stmts(db, reference_inputs, assembled)
Esempio n. 6
0
def _check_preassembly_with_database(num_stmts, batch_size):
    """Create a preassembled corpus in a test database and validate it.

    Parameters
    ----------
    num_stmts
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size
        Forwarded to `pm.PreassemblyManager`.

    Raises
    ------
    AssertionError
        If any of the consistency checks on the resulting tables fails.
    """
    db = _get_loaded_db(num_stmts)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_uuids = {raw_stmt.uuid for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end - start)

    # There must be some pa statements, but fewer than the raw statements.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the links between raw and pa statements.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_uuids = {ru.raw_stmt_uuid for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Every linked raw statement uuid must exist among the raw statements.
    # (Compared by value; `len(...) is 0` relied on int object identity.)
    assert all_link_uuids <= all_raw_uuids
    # Every pa statement must be referenced by at least one link.
    assert all(
        pa_stmt.mk_hash in all_link_mk_hashes for pa_stmt in pa_stmt_list)

    # At least one support link must have been created.
    num_support_links = db.filter_query(db.PASupportLinks).count()
    assert num_support_links

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([],
                                        preassembled=True,
                                        db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Now test the set of preassembled (pa) statements from the database against
    # what we get from old-fashioned preassembly (opa).
    raw_stmts = db_util.distill_stmts(db, get_full_stmts=True)
    _check_against_opa_stmts(raw_stmts, pa_stmts)