Example #1
def test_integrity():

    # Check that a record can be constructed at the Record and database level,
    # that the unique id is excluded from the concatenated search text,
    # and that the unique_id_col setting survives a reconnect

    record_dicts = []
    record_dict = {"uid": 0, "value": "hello"}
    record_dicts.append(record_dict)

    # Test construction works at the level of a single record
    record = Record(record_dict, unique_id_col='uid')

    # Test it works at the level of the db
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    db.write_list_dicts_parallel(record_dicts,
                                 unique_id_col='uid',
                                 batch_size=5)

    # Check the uid is stored as unique_id but kept out of the concatenated search text

    sql = 'select * from df'
    rec = db.conn.execute(sql).fetchall()[0]
    assert rec['unique_id'] == '0'
    assert '0' not in rec['concat_all']

    # Reconnect to file and check the unique_id_col is correct
    db2 = SearchDatabase(db_filename)

    assert db2.unique_id_col == 'uid'
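The property asserted above, that the uid never leaks into the searchable text, amounts to excluding the unique id column when fields are concatenated. A minimal sketch of that idea (concat_searchable is a hypothetical helper, not the library's API):

def concat_searchable(record: dict, unique_id_col: str) -> str:
    # Join every field except the unique id, so id values never
    # pollute the concatenated search text or the token statistics
    return " ".join(
        str(v) for k, v in record.items()
        if k != unique_id_col and v is not None
    )

assert "0" not in concat_searchable({"uid": 0, "value": "hello"}, "uid")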
Example #2
def test_build_and_search():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {"unique_id": 1, "first_name": "robin", "surname": "linacre"}
    rec2 = {"unique_id": 2, "first_name": "robyn", "surname": "linaker"}
    rec3 = {"unique_id": 3, "first_name": "robin", "surname": "linacre"}
    rec3 = {"unique_id": 4, "first_name": "david", "surname": "smith"}

    dicts = [rec1, rec2, rec3]
    db.write_list_dicts_parallel(dicts, unique_id_col="unique_id")

    db.build_or_replace_stats_tables()

    search_rec = {"unique_id": 4, "first_name": "robin", "surname": None}

    assert 1 in db.find_potental_matches(search_rec).keys()

    # With record caching, make sure that if the search record changes but the
    # unique id is for some reason left the same, we still get different results

    search_rec = {"unique_id": 4, "first_name": "david", "surname": None}

    assert 4 in db.find_potental_matches(search_rec).keys()
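The caching concern flagged in the comment above - results keyed only on unique_id would go stale when the record body changes - is avoided by keying on record content instead. A minimal sketch, assuming a hypothetical record_cache_key helper rather than the library's actual cache:

import hashlib
import json

def record_cache_key(record, unique_id_col="unique_id"):
    # Hash the record's content, excluding the unique id, so two
    # different records that share an id never collide in the cache
    payload = {k: v for k, v in record.items() if k != unique_id_col}
    blob = json.dumps(payload, sort_keys=True, default=str)
    return hashlib.sha256(blob.encode("utf-8")).hexdigest()

assert record_cache_key({"unique_id": 4, "first_name": "robin"}) != \
    record_cache_key({"unique_id": 4, "first_name": "david"})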
Example #3
def test_json_problem():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {"unique_id": 1, "first_name": "robin", "int_problem": 1}
    rec2 = {"unique_id": 2, "first_name": "robyn", "int_problem": 2}
    rec3 = {"unique_id": 3, "first_name": "robin", "int_problem": 3}
    rec3 = {"unique_id": 4, "first_name": "david", "int_problem": None}

    import pandas as pd

    dicts = [rec1, rec2, rec3]
    df = pd.DataFrame(dicts)
    df["int_problem"] = df["int_problem"].astype(pd.Int64Dtype())

    db.write_pandas_dataframe(df, unique_id_col="unique_id")
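The failure mode this test guards against: pandas nullable Int64 columns hold pd.NA (and numpy integer scalars), neither of which the stdlib json module can serialise. A minimal sketch of the problem and one common workaround (the conversion shown is an assumption, not necessarily what SearchDatabase does internally):

import json
import pandas as pd

df = pd.DataFrame([{"int_problem": 1}, {"int_problem": None}])
df["int_problem"] = df["int_problem"].astype(pd.Int64Dtype())

row = df.iloc[1].to_dict()  # {'int_problem': <NA>}
# json.dumps(row) raises TypeError because pd.NA is not JSON serialisable,
# so convert nullable values to plain Python types first
clean = {k: (None if pd.isna(v) else int(v)) for k, v in row.items()}
assert json.dumps(clean) == '{"int_problem": null}'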
Example #4
def test_record():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {
        'unique_id': "rectest_1",
        'first_name': 'robin',
        'surname': 'linacre'
    }
    rec2 = {
        'unique_id': "rectest_2",
        'first_name': 'robyn',
        'surname': 'linaker'
    }
    rec3 = {
        'unique_id': "rectest_3",
        'first_name': 'robin',
        'surname': 'linacre'
    }

    dicts = [rec1, rec2, rec3]
    db.write_list_dicts_parallel(dicts, unique_id_col='unique_id')

    db.build_or_replace_stats_tables()

    # You have to be careful with caching here - deliberately give each search
    # record a fresh unique id so results are not served from a stale cache entry
    search_rec = {
        'unique_id': 'search_rec_1',
        'first_name': 'robin',
        'surname': "smith"
    }

    r = Record(search_rec, 'unique_id', db.conn)

    assert 'ROBIN' in r.tokens_in_order_of_rarity
    assert 'SMITH' not in r.tokens_in_order_of_rarity

    search_rec = {
        'unique_id': 'search_rec_2',
        'first_name': 'dave',
        'surname': "linacre"
    }
    r = Record(search_rec, 'unique_id', db.conn)

    assert 'LINACRE' in r.tokens_in_order_of_rarity
    assert 'DAVE' not in r.tokens_in_order_of_rarity
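The two assertions above pin down the contract of tokens_in_order_of_rarity: tokens absent from the stats tables (SMITH, DAVE) are dropped, and the rest are ranked rarest first. A standalone sketch of that contract, using hypothetical in-memory proportions instead of the real stats tables:

def order_by_rarity(tokens, token_proportions):
    # Drop tokens the stats tables have never seen, then sort the
    # remainder so the rarest (lowest proportion) token comes first
    known = [t for t in tokens if t in token_proportions]
    return sorted(known, key=lambda t: token_proportions[t])

props = {"ROBIN": 2 / 6, "ROBYN": 1 / 6, "LINACRE": 2 / 6, "LINAKER": 1 / 6}
assert order_by_rarity(["ROBIN", "SMITH"], props) == ["ROBIN"]
assert order_by_rarity(["LINACRE", "ROBYN"], props) == ["ROBYN", "LINACRE"]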
Example #5
def test_integrity(db_con_string):

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try and add the same unique_id twice
    if db_con_string == "temp":
        db_filename = tempfile.NamedTemporaryFile().name
    else:
        db_filename = db_con_string

    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    # Ten tokens written so far: 'a' appears once (proportion 0.1)
    # and 'b' twice (proportion 0.2)

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.1

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'B'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.2

    # Add another 10 As.  Now there are 11 in 20
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 As, with repeated IDs, so they should be skipped
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum

    from value_token_counts

    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00
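The proportions asserted above are just count / total over the column's tokens: 1 of 10 for 'a' at first, 11 of 20 after ten more are written, and still 0.55 once the duplicate ids are skipped. That arithmetic in standalone form:

from collections import Counter

def token_proportions(tokens):
    # token_proportion = token_count / total tokens in the column
    counts = Counter(t.upper() for t in tokens)
    total = sum(counts.values())
    return {tok: n / total for tok, n in counts.items()}

tokens = ["a"] * 1 + ["b"] * 2 + ["c"] * 3 + ["d"] * 4
props = token_proportions(tokens)
assert props["A"] == 0.1 and props["B"] == 0.2

props = token_proportions(tokens + ["a"] * 10)  # 11 of 20 tokens are 'a'
assert props["A"] == 0.55
assert abs(sum(props.values()) - 1.0) < 1e-9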
Example #6
def test_integrity():

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try and add the same unique_id twice

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5,
                                 write_column_counters=False)

    # Column counters were skipped, so the stats should now be out of sync

    # Add another 10 As.  Now there are 11 in 20
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5,
                                 write_column_counters=False)

    # The state table should record that the counters are out of sync
    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'false'

    db.write_all_col_counters_to_db()

    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 As, with repeated IDs, so they should be skipped
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum

    from value_token_counts

    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00

    db2 = SearchDatabase(db_filename)

    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=5,
                                  write_column_counters=False)

    with pytest.warns(UserWarning):
        db3 = SearchDatabase(db_filename)

    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'false'
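The col_counters_in_sync flag above is a simple key/value state-table pattern: mark the counters dirty whenever a write defers them, and clean once they are flushed. A minimal sqlite3 sketch (the db_state table and helper names are assumptions, not the library's schema):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("create table db_state (key text primary key, value text)")

def set_state(conn, key, value):
    # Upsert the key so repeated writes just overwrite the value
    conn.execute(
        "insert into db_state (key, value) values (?, ?) "
        "on conflict(key) do update set value = excluded.value",
        (key, value),
    )

def get_state(conn, key):
    row = conn.execute("select value from db_state where key = ?", (key,)).fetchone()
    return row["value"] if row else None

set_state(conn, "col_counters_in_sync", "false")  # counter writes deferred
set_state(conn, "col_counters_in_sync", "true")   # counters flushed
assert get_state(conn, "col_counters_in_sync") == "true"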
Example #7
def test_integrity():

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try and add the same unique_id twice
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)
    records = []
    for char in list(string.ascii_lowercase):
        record = {"unique_id": char, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    sql_df_count = """
    select count(*) as count from df
    """

    results = db.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    db2 = SearchDatabase(db_filename)
    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    # At the moment, all tokens should have a count of 1

    sql_tkn_count = """
    select
        max(token_count) as max,
        min(token_count) as min,
        count(*) as count
    from value_token_counts
    """

    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 1
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    # Append three more records; the list deliberately grows to 29 items,
    # so we expect exactly three new rows
    for char in ["a", "b", "c"]:
        record = {"unique_id": f"{char}_2", "value": char}
        records.append(record)

    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()

    assert results[0]["count"] == 29

    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 2
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    sql_count_a = """
    select token_count
    from value_token_counts
    where token = 'A'
    """

    results = db2.conn.execute(sql_count_a)
    results = results.fetchall()
    assert results[0]["token_count"] == 2
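The counts above (26 stays 26 on a duplicate write, then 29 once three genuinely new ids arrive) are what you get from skipping rows whose unique id already exists. A minimal sqlite3 sketch of that duplicate handling, not the library's actual write path:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table df (unique_id text primary key, value text)")

def write_records(conn, records):
    # 'insert or ignore' silently skips rows whose unique_id already
    # exists, so re-writing the same list is a no-op
    conn.executemany(
        "insert or ignore into df (unique_id, value) values (:unique_id, :value)",
        records,
    )

records = [{"unique_id": c, "value": c} for c in "abc"]
write_records(conn, records)
write_records(conn, records)  # duplicates skipped
records.append({"unique_id": "a_2", "value": "a"})
write_records(conn, records)  # only the new id is written
assert conn.execute("select count(*) from df").fetchone()[0] == 4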