def _active_learning(clean_data, messy_data, sample_size, deduper,
                     training_file, settings_file):
    """Internal method that trains the deduper model using active learning.

    Parameters
    ----------
    clean_data : dict
        The dictionary form of the gazette that gazetteer_dedupe requires.
    messy_data : dict
        The dictionary form of the messy data that needs to be deduplicated
        (and canonicalized).
    sample_size : float, default 0.3
        The sample size used for training, as a float between 0 and 1.
        By default it is 30% (0.3) of our data.
    deduper : dedupe.Gazetteer
        A gazetteer model instance.
    training_file : str
        A path where the labeled training pairs will be written.
    settings_file : str
        A path where the learned weights and predicates will be written.

    Returns
    -------
    dedupe.Gazetteer
        A trained gazetteer model instance.
    """
    # To train dedupe, we feed it a sample of records.
    sample_num = math.floor(len(messy_data) * sample_size)
    deduper.prepare_training(clean_data, messy_data, sample_size=sample_num)

    print('Starting active labeling...')
    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates.
    deduper.train()

    # When finished, save our training to disk.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

    return deduper
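# Usage sketch (an assumption, not part of the original module): illustrates how
# _active_learning might be wired up. The field list and the file names below
# are hypothetical placeholders; clean_data and messy_data are dicts of records
# keyed by id, as described in the docstring above.
def _example_usage(clean_data, messy_data):
    """Hypothetical illustration of calling _active_learning."""
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'address', 'type': 'String', 'has missing': True}]
    gazetteer = dedupe.Gazetteer(fields)
    return _active_learning(clean_data, messy_data, 0.3, gazetteer,
                            'gazetteer_training.json',
                            'gazetteer_learned_settings')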
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        gazetteer.prepare_training(messy, canonical, training_file=tf)
else:
    gazetteer.prepare_training(messy, canonical)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')

dedupe.console_label(gazetteer)

gazetteer.train()

# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    gazetteer.write_training(tf)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'wb') as sf:
    gazetteer.write_settings(sf)

gazetteer.cleanup_training()
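# Sketch of the "next time" path the comment above refers to (an assumption,
# not part of this snippet): on a later run, the saved settings file can be
# loaded into a StaticGazetteer so no labeling or training is needed.
# `settings_file`, `messy`, and `canonical` are the same objects used above;
# the n_matches value is illustrative.
if os.path.exists(settings_file):
    with open(settings_file, 'rb') as sf:
        gazetteer = dedupe.StaticGazetteer(sf)
    gazetteer.index(canonical)
    results = gazetteer.search(messy, n_matches=1)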
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file, 'rb') as f:
        deduper.prepare_training(data_d, f)
else:
    deduper.prepare_training(data_d)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')

dedupe.console_label(deduper)

# Using the examples we just labeled, train the deduper and learn
# blocking predicates
deduper.train()

# When finished, save our training to disk
with open(training_file, 'w') as tf:
    deduper.write_training(tf)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)
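# Sketch of what typically follows once the model is trained and saved (an
# assumption, not part of this snippet): partition the records into clusters.
# `data_d` is the same record dict used above; the 0.5 threshold is illustrative.
clustered_dupes = deduper.partition(data_d, 0.5)
print('# duplicate sets', len(clustered_dupes))
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    # each cluster is a tuple of record ids plus per-record confidence scores
    print(cluster_id, records, scores)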
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        linker.prepare_training(data_1,
                                data_2,
                                training_file=tf,
                                sample_size=15000)
else:
    linker.prepare_training(data_1, data_2, sample_size=15000)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')

dedupe.console_label(linker)

linker.train()

# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    linker.write_training(tf)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'wb') as sf:
    linker.write_settings(sf)

# ## Blocking
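# Sketch of how the trained linker is typically applied after the blocking
# section above (an assumption about the rest of the script, which is not shown
# here): pair up records across the two data sets. The 0.5 threshold and the
# 'one-to-one' constraint are illustrative, not values taken from the original.
linked_records = linker.join(data_1, data_2, 0.5, constraint='one-to-one')
print('# duplicate sets', len(linked_records))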
def run_dedupe(settings_file, training_file, type):
    start_time = time.time()
    print("dedupe file ", dedupe.__file__)
    print("dedupe path ", dedupe.__path__)
    print("dedupe name ", dedupe.__name__)
    print("dedupe spec ", dedupe.__spec__)
    print("dedupe doc ", dedupe.__doc__)
    print("dedupe loader ", dedupe.__loader__)

    # Set the database connection from environment variable using
    # [dj_database_url](https://github.com/kennethreitz/dj-database-url)
    # For example:
    #   export DATABASE_URL=postgres://user:password@host/mydatabase
    db_conf = dj_database_url.config()

    if not db_conf:
        raise Exception(
            'set DATABASE_URL environment variable with your connection, e.g. '
            'export DATABASE_URL=postgres://user:password@host/mydatabase')

    read_con = psycopg2.connect(database=db_conf['NAME'],
                                user=db_conf['USER'],
                                password=db_conf['PASSWORD'],
                                host=db_conf['HOST'],
                                port=db_conf['PORT'],
                                cursor_factory=psycopg2.extras.RealDictCursor)

    write_con = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'])

    # We'll be using variations on the following select statement to pull
    # in campaign donor info.
    if type == 'IND':
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                       "from processed_donors where person = 1 and name not like '%unitem%' "
    else:
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                       "from processed_donors where person != 1 and name not like '%unitem%' "

    # ## Training

    if os.path.exists(settings_file):
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=1)
    else:
        # Define the fields dedupe will pay attention to
        #
        # The street, city, and zip fields are often missing, so we'll
        # tell dedupe that, and we'll learn a model that takes that into
        # account.
        fields = [
            {'field': 'name', 'type': 'String'},
            {'field': 'street', 'type': 'String', 'has missing': True},
            {'field': 'city', 'type': 'String', 'has missing': True},
            {'field': 'state', 'type': 'ShortString', 'has missing': True},
            {'field': 'zip', 'type': 'ShortString', 'has missing': True},
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=1)

        # Named cursor runs server side with psycopg2
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            temp_d = {i: row for i, row in enumerate(cur)}

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(temp_d, tf)
        else:
            deduper.prepare_training(temp_d)

        del temp_d

        # ## Active learning

        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.console_label(deduper)

        # When finished, save our labeled training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Notice our argument here
        #
        # `recall` is the proportion of true dupe pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

        # We can now remove some of the memory hogging objects we used
        # for training
        deduper.cleanup_training()

    print(f'deduper predicates: {deduper.predicates}')

    # ## Blocking
    print('blocking...')

    # To run blocking on such a large set of data, we create a separate table
    # that contains blocking keys and record ids
    print('creating blocking_map database')
    blocking_table = 'blocking_map_' + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + blocking_table)
            cur.execute("CREATE TABLE " + blocking_table + " "
                        "(block_key text, donor_id INTEGER)")

    # If dedupe learned an Index Predicate, we have to take a pass
    # through the data and create indices.
    print('creating inverted index')
    for field in deduper.fingerprinter.index_fields:
        with read_con.cursor('field_values') as cur:
            cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
            field_data = (row[field] for row in cur)
            deduper.fingerprinter.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(block_key, donor_id)` tuples.
    print('writing blocking map')

    with read_con.cursor('donor_select') as read_cur:
        read_cur.execute(DONOR_SELECT)

        full_data = ((row['donor_id'], row) for row in read_cur)
        b_data = deduper.fingerprinter(full_data)

        with write_con:
            with write_con.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + blocking_table + ' FROM STDIN WITH CSV',
                                      Readable(b_data),
                                      size=100000)

    # free up memory by removing indices
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("CREATE UNIQUE INDEX ON " + blocking_table + " "
                        "(block_key text_pattern_ops, donor_id)")

    # ## Clustering

    entity_map_table = "entity_map_" + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + entity_map_table)
            print('creating entity_map database')
            cur.execute("CREATE TABLE " + entity_map_table + " "
                        "(donor_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(donor_id))")

    read_con1 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)

    with read_con1.cursor(
            'pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute(
            "SELECT a.donor_id, "
            "row_to_json((SELECT d FROM (SELECT a.city, "
            "a.name, "
            "a.zip, "
            "a.state, "
            "a.street) d)), "
            "b.donor_id, "
            "row_to_json((SELECT d FROM (SELECT b.city, "
            "b.name, "
            "b.zip, "
            "b.state, "
            "b.street) d)) "
            "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west "
            "FROM " + blocking_table + " AS l "
            "INNER JOIN " + blocking_table + " AS r "
            "USING (block_key) "
            "WHERE l.donor_id < r.donor_id) ids "
            "INNER JOIN processed_donors a ON ids.east=a.donor_id "
            "INNER JOIN processed_donors b ON ids.west=b.donor_id")

        print('clustering...')
        clustered_dupes = deduper.cluster(deduper.score(record_pairs(read_cur)),
                                          threshold=0.5)

        # ## Writing out results

        # We now have a sequence of tuples of donor ids that dedupe believes
        # all refer to the same entity.
        # We write this out onto an entity map table.
        print('writing results')
        write_con1 = psycopg2.connect(database=db_conf['NAME'],
                                      user=db_conf['USER'],
                                      password=db_conf['PASSWORD'],
                                      host=db_conf['HOST'],
                                      port=db_conf['PORT'])
        with write_con1:
            with write_con1.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + entity_map_table + ' FROM STDIN WITH CSV',
                                      Readable(cluster_ids(clustered_dupes)),
                                      size=100000)

    with write_con1:
        with write_con1.cursor() as cur:
            cur.execute("CREATE INDEX head_index_" + type + " ON " +
                        entity_map_table + " (canon_id)")

    # Print out the number of duplicates found

    # ## Payoff

    # With all this done, we can now begin to ask interesting questions
    # of the data
    #
    # For example, let's see who the top 10 donors are.

    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')  # for pretty printing numbers
    read_con2 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)

    # save entity map
    entity_map_filename = entity_map_table + "_" + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv'
    donors_filename = 'processed_donors_' + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M', time.localtime()) + '.csv'
    with read_con2.cursor() as cur:
        with open(entity_map_filename, 'w') as file_out:
            cur.copy_expert(
                'COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER',
                file_out)
        with open(donors_filename, 'w') as file_out:
            cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER',
                            file_out)

    if type == 'IND':
        top_donor_where = 'donors.corp is null'
    else:
        top_donor_where = 'donors.corp is not null'

    # Create a temporary table so each group and unmatched record has
    # a unique id
    with read_con2.cursor() as cur:
        cur.execute(
            "CREATE TEMPORARY TABLE e_map "
            "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
            "FROM " + entity_map_table + " "
            "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "donation_totals.totals AS totals "
            "FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors INNER JOIN "
            "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (canon_id) "
            " ORDER BY totals "
            " DESC) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10"
        )

        print("Top Donors (deduped)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        # Compare this to what we would have gotten if we hadn't done any
        # deduplication
        cur.execute(
            "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "WHERE " + top_donor_where + " "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC) AS t where totals IS NOT NULL "
            "LIMIT 10")

        print("Top Donors (raw)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        cur.execute(
            " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors "
cluster_id " " FROM " + entity_map_table + " " " GROUP BY canon_id) " " AS cluster_totals " " ON donors.donor_id = cluster_totals.cluster_id " " ORDER BY cluster_size DESC LIMIT 10") # ##Print stats that are saved to match_runs table # Biggest cluster first print("Biggest Clusters") biggest_name = '' for row in cur: print('%(cluster_size)20s: %(name)s' % row) if biggest_name == '': biggest_name = row['name'] + ':' + str(row['cluster_id']) # Then total of donors if type == 'IND': processed_donors_where = '= 1' else: processed_donors_where = '!= 1' cur.execute( " SELECT count(*) AS num_donors FROM processed_donors WHERE person " + processed_donors_where) for row in cur: number_of_donors = row['num_donors'] # Then max, average and number of clusters cur.execute( " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters " " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, " " cluster_size, cluster_id " " FROM (SELECT * FROM donors WHERE " + top_donor_where + ") as donors" " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id " " FROM " + entity_map_table + " " " GROUP BY canon_id) " " AS cluster_totals " " ON donors.donor_id = cluster_totals.cluster_id " " ORDER BY cluster_size DESC) AS stats") # Print the stats print("stats...") print(f"total number of donors: {number_of_donors}") for row in cur: print( 'max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s' % row) biggest_size = row['biggest'] average_size = row['average'] number_of_clusters = row['number_of_clusters'] # write to the match_run table runtime = time.time() - start_time donor_cluster_ratio = number_of_donors / number_of_clusters with write_con1.cursor() as cur: cur.execute( """ INSERT INTO match_runs (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster, total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file) VALUES (NOW(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, (' '.join(str(pred) for pred in deduper.predicates), number_of_clusters, average_size, biggest_size, biggest_name, number_of_donors, type, runtime, donor_cluster_ratio, settings_file)) write_con1.commit() read_con.close() write_con.close() read_con1.close() write_con1.close() read_con2.close() print('ran in', runtime, 'seconds')
def link_dataframes(dfa, dfb, field_properties, config_name="link_dataframes", n_cores=None):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')

    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)
    else:
        # Define the fields the linker will pay attention to, based on the
        # supplied field_properties.
        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)

        # To train the linker, we feed it a sample of records.
        linker.prepare_training(data_1, data_2, sample_size=15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2, training_file=tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('Starting active labeling...')

        dedupe.console_label(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    # Convert linked records into dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa & dfb, add cluster id & confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    # Concatenate results from dfa + dfb
    # (pd.concat is used because DataFrame.append was removed in pandas 2.0)
    df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True)
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
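# Usage sketch (an assumption, not part of the original module): two toy
# dataframes linked on a shared set of columns. The exact field_properties
# format accepted by this function depends on select_fields/specify_type,
# which are defined elsewhere; plain column names are assumed here, and the
# data and config_name are hypothetical.
dfa_example = pd.DataFrame({'name': ['Acme Inc', 'Widget Co'],
                            'city': ['Boston', 'Chicago']})
dfb_example = pd.DataFrame({'name': ['ACME, Inc.', 'Widgets Company'],
                            'city': ['Boston', 'Chicago']})
df_linked = link_dataframes(dfa_example, dfb_example, ['name', 'city'],
                            config_name='toy_link_example')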