def test_integrity():
    # Want to test database insert functionality to check that:
    # 1. The unique id is stored in the 'unique_id' column
    # 2. The unique id value is excluded from the concatenated search text
    # 3. The unique_id_col setting is recovered when reconnecting to the file
    record_dicts = []
    record_dict = {"uid": 0, "value": "hello"}
    record_dicts.append(record_dict)

    # Test it works at the level of the record
    record = Record(record_dict, unique_id_col='uid')

    # Test it works at the level of the db
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    db.write_list_dicts_parallel(record_dicts, unique_id_col='uid', batch_size=5)

    # Check records are written with the uid stored as unique_id, and without the uid
    # value appearing in the concatenated text
    sql = 'select * from df'
    rec = db.conn.execute(sql).fetchall()[0]
    assert rec['unique_id'] == '0'
    assert '0' not in rec['concat_all']

    # Reconnect to the file and check the unique_id_col is correct
    db2 = SearchDatabase(db_filename)
    assert db2.unique_id_col == 'uid'
def test_build_and_search():
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    rec1 = {"unique_id": 1, "first_name": "robin", "surname": "linacre"}
    rec2 = {"unique_id": 2, "first_name": "robyn", "surname": "linaker"}
    rec3 = {"unique_id": 3, "first_name": "robin", "surname": "linacre"}
    rec4 = {"unique_id": 4, "first_name": "david", "surname": "smith"}

    dicts = [rec1, rec2, rec3, rec4]
    db.write_list_dicts_parallel(dicts, unique_id_col="unique_id")
    db.build_or_replace_stats_tables()

    search_rec = {"unique_id": 4, "first_name": "robin", "surname": None}
    assert 1 in db.find_potental_matches(search_rec).keys()

    # With record caching, we want to make sure that if the search rec is changed but the
    # unique id is for some reason left the same, we get different search results
    search_rec = {"unique_id": 4, "first_name": "david", "surname": None}
    assert 4 in db.find_potental_matches(search_rec).keys()
def test_json_problem():
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    rec1 = {"unique_id": 1, "first_name": "robin", "int_problem": 1}
    rec2 = {"unique_id": 2, "first_name": "robyn", "int_problem": 2}
    rec3 = {"unique_id": 3, "first_name": "robin", "int_problem": 3}
    rec4 = {"unique_id": 4, "first_name": "david", "int_problem": None}

    import pandas as pd

    dicts = [rec1, rec2, rec3, rec4]
    df = pd.DataFrame(dicts)

    # Use the nullable integer dtype so the None becomes a pandas NA value
    df["int_problem"] = df["int_problem"].astype(pd.Int64Dtype())

    db.write_pandas_dataframe(df, unique_id_col="unique_id")
def test_record():
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    rec1 = {
        'unique_id': "rectest_1",
        'first_name': 'robin',
        'surname': 'linacre'
    }

    rec2 = {
        'unique_id': "rectest_2",
        'first_name': 'robyn',
        'surname': 'linaker'
    }

    rec3 = {
        'unique_id': "rectest_3",
        'first_name': 'robin',
        'surname': 'linacre'
    }

    dicts = [rec1, rec2, rec3]
    db.write_list_dicts_parallel(dicts, unique_id_col='unique_id')
    db.build_or_replace_stats_tables()

    # You have to be careful with caching here - deliberately give each search record its
    # own unique id, not one that appears in the database, so cached results are not reused
    search_rec = {
        'unique_id': 'search_rec_1',
        'first_name': 'robin',
        'surname': "smith"
    }

    r = Record(search_rec, 'unique_id', db.conn)

    # SMITH does not appear anywhere in the database, so it gets no rarity ranking
    assert 'ROBIN' in r.tokens_in_order_of_rarity
    assert 'SMITH' not in r.tokens_in_order_of_rarity

    search_rec = {
        'unique_id': 'search_rec_2',
        'first_name': 'dave',
        'surname': "linacre"
    }

    r = Record(search_rec, 'unique_id', db.conn)

    # DAVE does not appear anywhere in the database, so it gets no rarity ranking
    assert 'LINACRE' in r.tokens_in_order_of_rarity
    assert 'DAVE' not in r.tokens_in_order_of_rarity
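# A minimal sketch of the idea behind Record.tokens_in_order_of_rarity as exercised above,
# assuming "rarity" simply means ascending corpus frequency and that tokens absent from the
# database are dropped. This illustrates the concept only - it is not the library's
# implementation, and order_tokens_by_rarity is a hypothetical helper:
def order_tokens_by_rarity(tokens, token_counts):
    # Keep only tokens that appear in the corpus, then sort so the rarest comes first
    known = [t for t in tokens if token_counts.get(t, 0) > 0]
    return sorted(known, key=lambda t: token_counts[t])


# e.g. with counts from the three records written above, {'ROBIN': 2, 'ROBYN': 1,
# 'LINACRE': 2, 'LINAKER': 1}:
#   order_tokens_by_rarity(['LINACRE', 'DAVE'], counts)  ->  ['LINACRE']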
def test_integrity_token_proportions(db_con_string):
    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try and add the same unique_id twice

    if db_con_string == "temp":
        db_filename = tempfile.NamedTemporaryFile().name
    else:
        db_filename = db_con_string

    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5)
    db._update_token_stats_tables()

    # Initial proportions: 'a' appears in 1 of the 10 records, 'b' in 2 of 10
    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'A'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.1

    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'B'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.2

    # Add another 10 'a's.  Now 'a' appears in 11 of 20 records, so its proportion is 0.55
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5)
    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'A'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 'a's with repeated IDs, so they should be skipped and the proportions
    # should be unchanged
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5)
    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'A'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum from value_token_counts
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00
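# The test above takes a db_con_string argument, but the fixture supplying it is not shown in
# this file.  A minimal sketch of one way to provide it, assuming the "temp" sentinel means
# "use a throwaway temporary file"; the parametrisation values are illustrative, and extra
# params could point at a real file path or connection string:
import pytest


@pytest.fixture(params=["temp"])
def db_con_string(request):
    # Each param is passed straight through to the test function
    return request.param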
def test_integrity_col_counter_sync():
    # Want to test that column counters can be written lazily:
    # 1. Writing with write_column_counters=False leaves the counters out of sync
    # 2. write_all_col_counters_to_db() brings them back in sync, after which token stats
    #    are computed correctly
    # 3. Reconnecting to an out-of-sync database raises a UserWarning

    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    # Counters are not written, so they should be out of sync
    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5,
                                 write_column_counters=False)

    # Add another 10 'a's.  Now 'a' appears in 11 of 20 records
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5,
                                 write_column_counters=False)

    # Status: out of sync until the counters are explicitly written
    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'false'
    db.write_all_col_counters_to_db()
    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'A'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 'a's with repeated IDs, so they should be skipped
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5)
    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion from value_token_counts
    where token = 'A'
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum from value_token_counts
    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00

    # Reconnect; the counters are in sync
    db2 = SearchDatabase(db_filename)
    assert db2.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    # Write without updating the counters, then reconnect: the new connection should warn
    # that the counters are out of sync
    db2.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5,
                                  write_column_counters=False)
    with pytest.warns(UserWarning):
        db3 = SearchDatabase(db_filename)
    assert db3.get_value_from_db_state_table('col_counters_in_sync') == 'false'
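# The workflow exercised above, written out as a usage recipe.  A sketch only: bulk_load is a
# hypothetical helper and batches stands for any iterable of record-dict lists.  Write each
# batch with write_column_counters=False, then bring the counters back in sync once at the
# end before rebuilding the token stats.
def bulk_load(db, batches, unique_id_col="unique_id"):
    for batch in batches:
        db.write_list_dicts_parallel(batch, unique_id_col=unique_id_col,
                                     batch_size=5, write_column_counters=False)
    # A single counter write at the end resets col_counters_in_sync to 'true'
    db.write_all_col_counters_to_db()
    db._update_token_stats_tables()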
def test_integrity_duplicate_unique_ids():
    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try and add the same unique_id twice

    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    records = []
    for char in list(string.ascii_lowercase):
        record = {"unique_id": char, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=5)

    sql_df_count = """
    select count(*) as count from df
    """
    results = db.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    # Writing the same records again should not create duplicate rows
    db2 = SearchDatabase(db_filename)
    db2.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    # At the moment, all tokens should have a count of 1
    sql_tkn_count = """
    select max(token_count) as max, min(token_count) as min, count(*) as count
    from value_token_counts
    """
    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 1
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    # Note records deliberately includes 29 items now: 26 repeats plus three new ones
    for char in ["a", "b", "c"]:
        record = {"unique_id": f"{char}_2", "value": char}
        records.append(record)

    db2.write_list_dicts_parallel(records, unique_id_col='unique_id', batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 29

    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 2
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    # 'a' now appears in two records: 'a' and 'a_2'
    sql_count_a = """
    select token_count from value_token_counts
    where token = 'A'
    """
    results = db2.conn.execute(sql_count_a)
    results = results.fetchall()
    assert results[0]["token_count"] == 2
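# The tests above repeat the same execute/fetchall/index pattern for every check.  A small
# helper like the following could keep the assertions terser (fetch_one_value is hypothetical,
# not part of SearchDatabase; it assumes rows support dict-style access, as they do above):
def fetch_one_value(conn, sql, column):
    # Run a query expected to return a single row and pull one named column out of it
    rows = conn.execute(sql).fetchall()
    return rows[0][column]


# e.g.  assert fetch_one_value(db2.conn, sql_count_a, "token_count") == 2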