def _get_loaded_db(num_stmts, batch_size=None, split=None,
                   with_init_corpus=False):
    """Build a test database and fill it with raw statements and agents.

    Parameters
    ----------
    num_stmts : int
        Forwarded to `_get_input_stmt_tuples` to obtain the statement rows
        (presumably the number of statements to load -- confirm against
        that helper).
    batch_size : int or None
        Batch size given to the PreassemblyManager that builds the initial
        corpus when `split` is used. Must not be None if `split` is given.
    split : float or None
        If None, every row is copied in one go. Otherwise
        `int(split * n_rows)` randomly sampled rows are copied first with a
        `create_date` two days in the past, and the remaining rows are
        copied afterwards stamped with the current time.
    with_init_corpus : bool
        If True, a preassembled corpus is created right after the first
        (or only) insertion of raw statements.

    Returns
    -------
    db
        The database handle produced by `_get_background_loaded_db`, now
        loaded with raw statements and agents.
    """
    print("Creating and filling a test database:")
    db = _get_background_loaded_db()

    # Much of the statement loading is still under active development.
    # TODO: Format pickle to match
    stmt_rows, col_names = _get_input_stmt_tuples(num_stmts)

    print("\tInserting the raw statements...")

    # Simple case: everything goes in at once.
    if split is None:
        db.copy('raw_statements', stmt_rows, col_names)
        print("\tAdding agents...")
        db_util.insert_agents(db, 'raw')
        if with_init_corpus:
            print("\tAdding a preassembled corpus...")
            pm.PreassemblyManager().create_corpus(db)
        return db

    # Split case: an older first batch followed by a fresh second batch.
    assert batch_size is not None
    n_first = int(split * len(stmt_rows))
    first_rows = random.sample(stmt_rows, n_first)
    later_rows = list(set(stmt_rows) - set(first_rows))

    # Back-date the first batch so it is distinguishable by create_date.
    old_stamp = datetime.now() - timedelta(days=2)
    dated_first = [row + (old_stamp, ) for row in first_rows]
    db.copy('raw_statements', dated_first, col_names + ('create_date', ))
    print("\tAdding agents...")
    db_util.insert_agents(db, 'raw')

    if with_init_corpus:
        print("\tAdding a preassembled corpus from first batch of raw "
              "stmts...")
        pm.PreassemblyManager(batch_size=batch_size).create_corpus(db)

    print("\tInserting the rest of the raw statements...")
    fresh_stamp = datetime.now()
    dated_later = [row + (fresh_stamp, ) for row in later_rows]
    db.copy('raw_statements', dated_later, col_names + ('create_date', ))
    print("\tAdding agents...")
    db_util.insert_agents(db, 'raw')
    return db
def add_statements(self, fraction=1, with_pa=False):
    """Add statements and agents to the database.

    Parameters
    ----------
    fraction : float between 0 and 1
        Default is 1. The fraction of remaining statements to be added.
        Any value other than 1 selects a random sample of
        `int(fraction * n_available)` statements.
    with_pa : bool
        Default False. Choose to run pre-assembly/incremental-preassembly
        on the added statements.
    """
    available_tuples = self.get_available_stmt_tuples()

    # Compare by value, not identity: `fraction is not 1` is True for 1.0
    # and only works for the int 1 thanks to CPython's small-int cache.
    if fraction != 1:
        # random.sample requires an integer sample size; the raw product
        # of a float fraction and a length is a float.
        num_stmts = int(fraction * len(available_tuples))
        input_tuples = random.sample(available_tuples, num_stmts)
    else:
        input_tuples = available_tuples

    print("Loading %d statements..." % len(input_tuples))
    if hasattr(self.test_db.RawStatements, 'id'):
        self.test_db.copy('raw_statements', input_tuples,
                          self.test_data['raw_statements']['cols'])
    else:
        # The table has no `id` attribute, so drop the leading element of
        # every tuple along with the matching leading column name.
        self.test_db.copy('raw_statements', [t[1:] for t in input_tuples],
                          self.test_data['raw_statements']['cols'][1:])

    print("Inserting agents...")
    db_util.insert_agents(self.test_db, 'raw')

    if with_pa:
        print("Preassembling new statements...")
        if len(input_tuples) > 100:
            # For larger loads, process in roughly ten batches.
            batch_size = len(input_tuples) // 10
            pam = pm.PreassemblyManager(1, batch_size)
        else:
            pam = pm.PreassemblyManager()

        # Supplement an existing corpus if statements were already used,
        # otherwise build the corpus from scratch.
        if self.used_stmt_tuples:
            pam.supplement_corpus(self.test_db)
        else:
            pam.create_corpus(self.test_db)
def _check_preassembly_with_database(num_stmts, batch_size, n_proc=1):
    """Create a preassembled corpus in a test database and validate it.

    Parameters
    ----------
    num_stmts : int
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size : int
        Batch size given to the PreassemblyManager.
    n_proc : int
        Default 1. Forwarded to the PreassemblyManager as `n_proc`.

    Raises
    ------
    AssertionError
        If any of the checks on raw statements, pa statements, evidence
        links or support links fails.
    """
    db = _get_loaded_db(num_stmts)

    # Gather the input for old-fashioned preassembly (opa); the result is
    # compared against the database's preassembled (pa) statements at the
    # end of this check.
    opa_inp_stmts = _get_opa_input_stmts(db)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_ids = {raw_stmt.id for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                       print_logs=True)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end-start)

    # Make sure the number of pa statements is within reasonable bounds.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the evidence links.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_ids = {ru.raw_stmt_id for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Every linked raw statement id must belong to a known raw statement.
    # (Tested by emptiness rather than `len(...) is 0`, which compared
    # identity against an int literal.)
    assert not (all_link_ids - all_raw_ids)
    # Every pa statement must be the target of at least one evidence link.
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # Check the support links.
    sup_links = db.select_all([db.PASupportLinks.supporting_mk_hash,
                               db.PASupportLinks.supported_mk_hash])
    assert sup_links
    assert not any(link[0] == link[1] for link in sup_links),\
        "Found self-support in the database."

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Map each statement's hash to whether that hash also shows up among
    # the statements it supports or is supported by.
    self_supports = {
        shash(s): shash(s) in {shash(s_) for s_ in s.supported_by + s.supports}
        for s in pa_stmts
    }
    assert not any(self_supports.values()),\
        "Found self-support in constructed pa statement objects."

    _check_against_opa_stmts(db, opa_inp_stmts, pa_stmts)
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8):
    """Supplement an initial corpus and compare against plain preassembly.

    Parameters
    ----------
    num_stmts : int
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size : int
        Batch size used both for the initial corpus built by
        `_get_loaded_db` and for the supplementing PreassemblyManager.
    split : float
        Default 0.8. Forwarded to `_get_loaded_db` as the fraction of
        statements loaded before the initial corpus is created.
    """
    # `_get_loaded_db` asserts that a batch size is provided whenever
    # `split` is given, so it has to be forwarded here.
    db = _get_loaded_db(num_stmts, batch_size=batch_size, split=split,
                        with_init_corpus=True)

    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    print("Beginning supplement...")
    pa_manager.supplement_corpus(db)
    end = datetime.now()
    print("Duration of incremental update:", end-start)

    # Fetch both the raw and the preassembled statements for comparison.
    raw_stmts = db_client.get_statements([], preassembled=False, db=db)
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    _check_against_opa_stmts(db, raw_stmts, pa_stmts)
def _check_db_pa_supplement(num_stmts, batch_size, split=0.8, n_proc=1):
    """Run an incremental corpus update and compare it to plain preassembly.

    Parameters
    ----------
    num_stmts : int
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size : int
        Batch size given to the PreassemblyManager.
    split : float
        Default 0.8. Forwarded to `_get_loaded_db`.
    n_proc : int
        Default 1. Forwarded to the PreassemblyManager as `n_proc`.
    """
    manager = pm.PreassemblyManager(batch_size=batch_size, n_proc=n_proc,
                                    print_logs=True)

    # NOTE(review): this relies on `_get_loaded_db` accepting a `pam`
    # keyword -- confirm against the loader's current signature.
    db = _get_loaded_db(num_stmts, split=split, pam=manager)
    opa_inputs = _get_opa_input_stmts(db)

    # Time the supplement step on its own.
    t_begin = datetime.now()
    print("Beginning supplement...")
    manager.supplement_corpus(db)
    elapsed = datetime.now() - t_begin
    print("Duration of incremental update:", elapsed)

    db_pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                           with_support=True)
    _check_against_opa_stmts(db, opa_inputs, db_pa_stmts)
def _check_preassembly_with_database(num_stmts, batch_size):
    """Create a preassembled corpus in a test database and validate it.

    Parameters
    ----------
    num_stmts : int
        Forwarded to `_get_loaded_db` to build the test database.
    batch_size : int
        Batch size given to the PreassemblyManager.

    Raises
    ------
    AssertionError
        If any of the checks on raw statements, pa statements, evidence
        links or support links fails.
    """
    db = _get_loaded_db(num_stmts)

    # Get the set of raw statements.
    raw_stmt_list = db.select_all(db.RawStatements)
    all_raw_uuids = {raw_stmt.uuid for raw_stmt in raw_stmt_list}
    assert len(raw_stmt_list)

    # Run the preassembly initialization.
    start = datetime.now()
    pa_manager = pm.PreassemblyManager(batch_size=batch_size)
    pa_manager.create_corpus(db)
    end = datetime.now()
    print("Duration:", end - start)

    # There must be some pa statements, but fewer than raw statements.
    pa_stmt_list = db.select_all(db.PAStatements)
    assert 0 < len(pa_stmt_list) < len(raw_stmt_list)

    # Check the links between raw statements and pa statements.
    raw_unique_link_list = db.select_all(db.RawUniqueLinks)
    assert len(raw_unique_link_list)
    all_link_uuids = {ru.raw_stmt_uuid for ru in raw_unique_link_list}
    all_link_mk_hashes = {ru.pa_stmt_mk_hash for ru in raw_unique_link_list}
    # Every linked raw statement uuid must belong to a known raw statement.
    # (Tested by emptiness rather than `len(...) is 0`, which compared
    # identity against an int literal.)
    assert not (all_link_uuids - all_raw_uuids)
    # Every pa statement must be the target of at least one link.
    assert all(pa_stmt.mk_hash in all_link_mk_hashes
               for pa_stmt in pa_stmt_list)

    # At least one support link must have been recorded.
    num_support_links = db.filter_query(db.PASupportLinks).count()
    assert num_support_links

    # Try to get all the preassembled statements from the table.
    pa_stmts = db_client.get_statements([], preassembled=True, db=db,
                                        with_support=True)
    assert len(pa_stmts) == len(pa_stmt_list), (len(pa_stmts),
                                                len(pa_stmt_list))

    # Now test the set of preassembled (pa) statements from the database
    # against what we get from old-fashioned preassembly (opa).
    raw_stmts = db_util.distill_stmts(db, get_full_stmts=True)
    _check_against_opa_stmts(raw_stmts, pa_stmts)