Example #1
import math

import dedupe


def _active_learning(clean_data, messy_data, sample_size, deduper,
                     training_file, settings_file):
    """Internal method that trains the deduper model using active learning.
        Parameters
        ----------
        clean_data : dict
            The dictionary form of the gazette that gazetteer_dedupe requires.
        messy_data : dict
            The dictionary form of the messy data that needs to be deduplicated 
            (and canonicalized)
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        deduper : a gazetteer model instance
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        settings_file : str
            A path to a settings file that will be loaded if it exists.
            
        Returns
        -------
        dedupe.Gazetteer
            A trained gazetteer model instance.
    """
    # To train dedupe, we feed it a sample of records.
    sample_num = math.floor(len(messy_data) * sample_size)
    deduper.prepare_training(clean_data, messy_data, sample_size=sample_num)

    print('Starting active labeling...')

    dedupe.console_label(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf)

    return deduper
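# A hedged usage sketch (not part of the source): the field definition and the
# tiny clean/messy dicts below are made up for illustration, and
# dedupe.console_label will prompt for labels interactively.
fields = [{'field': 'name', 'type': 'String'}]
gazetteer = dedupe.Gazetteer(fields)

clean_data = {0: {'name': 'Acme Incorporated'},
              1: {'name': 'Widget Works'},
              2: {'name': 'Initech LLC'}}
messy_data = {0: {'name': 'acme inc'},
              1: {'name': 'widget works llc'},
              2: {'name': 'initech'}}

trained = _active_learning(clean_data, messy_data,
                           sample_size=1.0,
                           deduper=gazetteer,
                           training_file='gazetteer_training.json',
                           settings_file='gazetteer_learned_settings')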
Example #2
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.prepare_training(messy, canonical, training_file=tf)
        else:
            gazetteer.prepare_training(messy, canonical)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.console_label(gazetteer)

        gazetteer.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            gazetteer.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.write_settings(sf)

        gazetteer.cleanup_training()
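# A hedged follow-up sketch (not part of the source): on a later run, the saved
# settings file can be loaded into a StaticGazetteer, skipping training, and
# the gazetteer can then index the canonical records and search the messy ones.
# The names `settings_file`, `canonical` and `messy` follow the snippet above.
with open(settings_file, 'rb') as sf:
    gazetteer = dedupe.StaticGazetteer(sf)

gazetteer.index(canonical)
results = gazetteer.search(messy, n_matches=1, generator=False)
for messy_id, matches in results:
    for canonical_id, score in matches:
        print(messy_id, canonical_id, score)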
Example #3
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.console_label(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)
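# A hedged follow-up sketch (not part of the source): once training and
# settings are saved, the usual next step in the dedupe examples is to cluster
# the records. `partition` blocks, scores and clusters `data_d` in one call;
# 0.5 is an illustrative threshold.
clustered_dupes = deduper.partition(data_d, threshold=0.5)
print('# duplicate sets', len(clustered_dupes))
for cluster, scores in clustered_dupes:
    for record_id, score in zip(cluster, scores):
        print(record_id, score)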
Example #4
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1,
                                        data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.console_label(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking
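# A hedged follow-up sketch (not part of the source): with settings saved, a
# later run can load a StaticRecordLink and go straight to matching. `join`
# blocks, scores and links the two data sets in one call; the names
# `settings_file`, `data_1` and `data_2` follow the snippet above.
with open(settings_file, 'rb') as sf:
    linker = dedupe.StaticRecordLink(sf)

linked_records = linker.join(data_1, data_2, threshold=0.5)
print('# linked pairs', len(linked_records))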
Example #5
def run_dedupe(settings_file, training_file, type):
    start_time = time.time()
    print("dedupe file ", dedupe.__file__)
    print("dedupe path ", dedupe.__path__)
    print("dedupe name ", dedupe.__name__)
    print("dedupe spec ", dedupe.__spec__)
    print("dedupe doc ", dedupe.__doc__)
    print("dedupe loader ", dedupe.__loader__)
    # Set the database connection from environment variable using
    # [dj_database_url](https://github.com/kennethreitz/dj-database-url)
    # For example:
    #   export DATABASE_URL=postgres://user:password@host/mydatabase
    db_conf = dj_database_url.config()

    if not db_conf:
        raise Exception(
            'set DATABASE_URL environment variable with your connection, e.g. '
            'export DATABASE_URL=postgres://user:password@host/mydatabase')

    read_con = psycopg2.connect(database=db_conf['NAME'],
                                user=db_conf['USER'],
                                password=db_conf['PASSWORD'],
                                host=db_conf['HOST'],
                                port=db_conf['PORT'],
                                cursor_factory=psycopg2.extras.RealDictCursor)

    write_con = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'])

    # We'll be using variations on this following select statement to pull
    # in campaign donor info.
    if type == 'IND':
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person = 1 and name not like '%unitem%' "
    else:
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person != 1 and name not like '%unitem%' "

    # ## Training
    if os.path.exists(settings_file):
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=1)
    else:
        # Define the fields dedupe will pay attention to
        #
        # The street, city, and zip fields are often missing, so we'll
        # tell dedupe that, and we'll learn a model that takes that into
        # account.
        fields = [
            {
                'field': 'name',
                'type': 'String'
            },
            {
                'field': 'street',
                'type': 'String',
                'has missing': True
            },
            {
                'field': 'city',
                'type': 'String',
                'has missing': True
            },
            {
                'field': 'state',
                'type': 'ShortString',
                'has missing': True
            },
            {
                'field': 'zip',
                'type': 'ShortString',
                'has missing': True
            },
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=1)

        # Named cursor runs server side with psycopg2
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            temp_d = {i: row for i, row in enumerate(cur)}

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from
        # scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(temp_d, tf)
        else:
            deduper.prepare_training(temp_d)

        del temp_d

        # ## Active learning

        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.console_label(deduper)
        # When finished, save our labeled training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Notice our argument here
        #
        # `recall` is the proportion of true duplicate pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

        # We can now remove some of the memory hogging objects we used
        # for training
        deduper.cleanup_training()

    print(f'deduper predicates: {deduper.predicates}')
    # ## Blocking
    print('blocking...')

    # To run blocking on such a large set of data, we create a separate table
    # that contains blocking keys and record ids
    print('creating blocking_map database')
    blocking_table = 'blocking_map_' + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + blocking_table)
            cur.execute("CREATE TABLE " + blocking_table + " "
                        "(block_key text, donor_id INTEGER)")

    # If dedupe learned an Index Predicate, we have to take a pass
    # through the data and create indices.
    print('creating inverted index')

    for field in deduper.fingerprinter.index_fields:
        with read_con.cursor('field_values') as cur:
            cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
            field_data = (row[field] for row in cur)
            deduper.fingerprinter.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(block_key, donor_id)` tuples.
    print('writing blocking map')

    with read_con.cursor('donor_select') as read_cur:
        read_cur.execute(DONOR_SELECT)

        full_data = ((row['donor_id'], row) for row in read_cur)
        b_data = deduper.fingerprinter(full_data)

        with write_con:
            with write_con.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + blocking_table +
                                      ' FROM STDIN WITH CSV',
                                      Readable(b_data),
                                      size=100000)

    # free up memory by removing indices
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("CREATE UNIQUE INDEX ON " + blocking_table + " "
                        "(block_key text_pattern_ops, donor_id)")

    # ## Clustering
    entity_map_table = "entity_map_" + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + entity_map_table)

            print('creating entity_map database')

            cur.execute("CREATE TABLE " + entity_map_table + " "
                        "(donor_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(donor_id))")

    read_con1 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    with read_con1.cursor(
            'pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute(
            "SELECT a.donor_id, "
            "row_to_json((SELECT d FROM (SELECT a.city, "
            "a.name, "
            "a.zip, "
            "a.state, "
            "a.street) d)), "
            "b.donor_id, "
            "row_to_json((SELECT d FROM (SELECT b.city, "
            "b.name, "
            "b.zip, "
            "b.state, "
            "b.street) d)) "
            "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west "
            "FROM " + blocking_table + " AS l "
            "INNER JOIN " + blocking_table + " AS r "
            "USING (block_key) "
            "WHERE l.donor_id < r.donor_id) ids "
            "INNER JOIN processed_donors a ON ids.east=a.donor_id "
            "INNER JOIN processed_donors b ON ids.west=b.donor_id")

        print('clustering...')
        clustered_dupes = deduper.cluster(
            deduper.score(record_pairs(read_cur)), threshold=0.5)

        # ## Writing out results

        # We now have a sequence of tuples of donor ids that dedupe believes
        # all refer to the same entity. We write this out onto an entity map
        # table

        print('writing results')
        write_con1 = psycopg2.connect(database=db_conf['NAME'],
                                      user=db_conf['USER'],
                                      password=db_conf['PASSWORD'],
                                      host=db_conf['HOST'],
                                      port=db_conf['PORT'])
        with write_con1:
            with write_con1.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + entity_map_table +
                                      ' FROM STDIN WITH CSV',
                                      Readable(cluster_ids(clustered_dupes)),
                                      size=100000)

    with write_con1:
        with write_con1.cursor() as cur:
            cur.execute("CREATE INDEX head_index_" + type + " ON " +
                        entity_map_table + " (canon_id)")

    # Print out the number of duplicates found

    # ## Payoff

    # With all this done, we can now begin to ask interesting questions
    # of the data
    #
    # For example, let's see who the top 10 donors are.

    locale.setlocale(locale.LC_ALL,
                     'en_US.UTF-8')  # for pretty printing numbers
    read_con2 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    # save entity map
    entity_map_filename = entity_map_table + "_" + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    donors_filename = 'processed_donors_' + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    with read_con2.cursor() as cur:
        with open(entity_map_filename, 'w') as file_out:
            cur.copy_expert(
                'COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER',
                file_out)
        with open(donors_filename, 'w') as file_out:
            cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER',
                            file_out)

    if type == 'IND':
        top_donor_where = 'donors.corp is null'
    else:
        top_donor_where = 'donors.corp is not null'
    # Create a temporary table so each group and unmatched record has
    # a unique id
    with read_con2.cursor() as cur:
        cur.execute(
            "CREATE TEMPORARY TABLE e_map "
            "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
            "FROM " + entity_map_table + " "
            "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "donation_totals.totals AS totals "
            "FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors INNER JOIN "
            "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (canon_id) "
            " ORDER BY totals "
            " DESC) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10"
        )

        print("Top Donors (deduped)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        # Compare this to what we would have gotten if we hadn't done any
        # deduplication

        cur.execute(
            "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "WHERE " + top_donor_where + " "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC) AS t where totals IS NOT NULL "
            "LIMIT 10")

        print("Top Donors (raw)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        cur.execute(
            " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors "
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC LIMIT 10")
        # ## Print stats that are saved to the match_runs table
        # Biggest cluster first
        print("Biggest Clusters")
        biggest_name = ''
        for row in cur:
            print('%(cluster_size)20s: %(name)s' % row)
            if biggest_name == '':
                biggest_name = row['name'] + ':' + str(row['cluster_id'])
        # Then the total number of donors
        if type == 'IND':
            processed_donors_where = '= 1'
        else:
            processed_donors_where = '!= 1'
        cur.execute(
            " SELECT count(*) AS num_donors FROM processed_donors WHERE person "
            + processed_donors_where)
        for row in cur:
            number_of_donors = row['num_donors']
        # Then max, average and number of clusters
        cur.execute(
            " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters "
            " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors"
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC) AS stats")
        # Print the stats
        print("stats...")
        print(f"total number of donors: {number_of_donors}")
        for row in cur:
            print(
                'max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s'
                % row)
            biggest_size = row['biggest']
            average_size = row['average']
            number_of_clusters = row['number_of_clusters']
    # write to the match_runs table
    runtime = time.time() - start_time
    donor_cluster_ratio = number_of_donors / number_of_clusters
    with write_con1.cursor() as cur:
        cur.execute(
            """ 
            INSERT INTO match_runs 
            (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster,
             total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file)
            VALUES (NOW(), %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s) """,
            (' '.join(str(pred)
                      for pred in deduper.predicates), number_of_clusters,
             average_size, biggest_size, biggest_name, number_of_donors, type,
             runtime, donor_cluster_ratio, settings_file))
    write_con1.commit()

    read_con.close()
    write_con.close()
    read_con1.close()
    write_con1.close()
    read_con2.close()

    print('ran in', runtime, 'seconds')
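# The snippet above relies on three helpers that are defined elsewhere in the
# source: `Readable` (wraps an iterator of tuples as a CSV file-like object for
# COPY FROM STDIN), `record_pairs` (turns the pairs query results into the
# (id, record) tuples that deduper.score expects) and `cluster_ids` (flattens
# clustered results into (donor_id, canon_id, score) rows). A minimal sketch of
# plausible implementations, modeled on dedupe's pgsql_big_dedupe_example:
import csv
import io
import itertools


class Readable(object):

    def __init__(self, iterator):
        self.output = io.StringIO()
        self.writer = csv.writer(self.output)
        self.iterator = iterator

    def read(self, size):
        # copy_expert repeatedly calls read(size); write that many rows as CSV
        # and return the chunk, returning '' once the iterator is exhausted.
        self.writer.writerows(itertools.islice(self.iterator, size))
        chunk = self.output.getvalue()
        self.output.seek(0)
        self.output.truncate(0)
        return chunk


def record_pairs(result_set):
    # Each row is (a_id, a_record, b_id, b_record) from the pairs query.
    for row in result_set:
        a_record_id, a_record, b_record_id, b_record = row
        yield (a_record_id, a_record), (b_record_id, b_record)


def cluster_ids(clustered_dupes):
    # Flatten (cluster, scores) pairs into (donor_id, canon_id, score) rows,
    # using the first record id in each cluster as the canonical id.
    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]
        for donor_id, score in zip(cluster, scores):
            yield donor_id, cluster_id, score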
Example #6
def link_dataframes(dfa,
                    dfb,
                    field_properties,
                    config_name="link_dataframes",
                    n_cores=None):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')
    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)

    else:
        # Define the fields the linker will pay attention to, built from the
        # field_properties passed in by the caller.

        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)
        # To train the linker, we feed it a sample of records.
        #
        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('Starting active labeling...')

        dedupe.console_label(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking

    # ## Clustering

    # `join` blocks, scores and links the two data sets in one call. The
    # threshold of 0 passed below accepts links regardless of score; raise it
    # to trade recall for precision.

    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    # Convert linked records into a dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa & dfb, add cluster id & confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    # Concatenate results from dfa + dfb
    df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True)
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
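# A hedged usage sketch (not part of the source): the exact shape of
# `field_properties` depends on the helpers `clean_punctuation`,
# `specify_type` and `select_fields` defined elsewhere in this module; a plain
# list of shared column names is assumed here, and the console will prompt for
# labels on the first run.
import pandas as pd

dfa = pd.DataFrame({'name': ['Acme Inc', 'Widget Works', 'Initech LLC'],
                    'city': ['Chicago', 'Boston', 'Austin']})
dfb = pd.DataFrame({'name': ['Acme, Incorporated', 'initech'],
                    'city': ['chicago', 'Austin']})

linked = link_dataframes(dfa, dfb, ['name', 'city'],
                         config_name='acme_widget_link')
print(linked[['cluster id', 'confidence', 'name', 'city']])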