import math
import os

import dedupe
import pandas as pd

# Note: the helper functions used below (clean_punctuation, specify_type,
# select_fields, _train, _cluster) are not defined in this section and are
# assumed to live elsewhere in the same package.


def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : list
            A list specifying what fields to use for deduplicating records.
        canonicalize : bool or list, default False
            Option that provides the canonical records as additional columns.
            Specifying a list of column names only canonicalizes those columns.
        config_name : str, default dedupe_dataframe
            The configuration file name. Note that this will be used as 
            a prefix to save the settings and training files.
        recall_weight : int, default 1
            Find the threshold that will maximize a weighted average of our
            precision and recall.  When we set the recall weight to 2, we are
            saying we care twice as much about recall as we do precision.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.

    """
    # Import Data  
    config_name = config_name.replace(" ", "_")
   
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))
    
    # train or load the model
    deduper = _train(settings_file, training_file, data_d, field_properties,
                     sample_size)

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # cluster the records
    clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
    results = df.join(clustered_df, how='left')
    results.drop(['dictionary'], axis=1, inplace=True)

    return results
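
# Usage sketch (illustrative, not from the original source): deduplicating a
# small, hypothetical two-column dataframe on both fields. The column names
# and sample values are assumptions; on a first run dedupe opens its
# interactive console labelling session and then saves the settings/training
# files named after config_name.
def _example_dedupe_usage():
    import pandas as pd

    customers = pd.DataFrame({
        'name': ['Jane Doe', 'Jane  Doe.', 'John Smith'],
        'city': ['Berlin', 'Berlin', 'Munich'],
    })
    # Per the docstring, the result keeps the original columns and adds the
    # cluster id and confidence score (plus canonical columns here, since
    # canonicalize=True).
    return dedupe_dataframe(customers, ['name', 'city'],
                            canonicalize=True, recall_weight=1)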


# Example 2
def gazetteer_dataframe(clean_data,
                        messy_data,
                        field_properties,
                        canonicalize=False,
                        config_name="gazetteer_dataframe",
                        update_model=False,
                        threshold=0.3,
                        sample_size=1,
                        n_cores=None):
    """Deduplicates a dataframe given fields of interest.
        Parameters
        ----------
        clean_data : pd.DataFrame
            The gazetteer dataframe.
        messy_data : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : str
            The name of the field (a single column) to use for matching
            records against the gazetteer.
        canonicalize : bool or list, default False
            Option that provides the canonical records as additional columns.
            Specifying a list of column names only canonicalizes those columns.
        config_name : str, default gazetteer_dataframe
            The configuration file name. Note that this will be used as
            a prefix to save the settings and training files.
        update_model : bool, default False
            If True, allows the user to update an existing model by loading
            a previously saved training file.
        threshold : float, default 0.3
            Only put records together into clusters if the cophenetic
            similarity of the cluster is greater than the threshold.
        sample_size : float, default 1
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 100% (1) of our data.
        n_cores : int, default None
            Specify the number of cores to use during clustering.
            By default n_cores is None (i.e. use multiprocessing with as many
            processes as there are CPU cores).

        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.
    """
    # Import Data
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')
    assert isinstance(clean_data, pd.DataFrame), \
        'Please provide a gazetteer in pandas dataframe format'
    assert len(clean_data.columns) == 1, \
        'Please provide a gazetteer dataframe made of a single variable'
    assert isinstance(field_properties, str), \
        'field_properties must be in string (str) format'

    # Common column name
    common_name = clean_data.columns[0]

    # Canonical dataset (i.e. the gazetteer)
    df_canonical = clean_punctuation(clean_data)
    df_canonical.rename(columns={field_properties: common_name}, inplace=True)
    specify_type(df_canonical, [common_name])

    df_canonical['dictionary'] = df_canonical.apply(
        lambda x: dict(zip(df_canonical.columns, x.tolist())), axis=1)
    canonical = dict(zip(df_canonical.index, df_canonical.dictionary))

    # Messy dataset
    df_messy = clean_punctuation(messy_data)
    df_messy.rename(columns={field_properties: common_name}, inplace=True)
    specify_type(df_messy, [common_name])

    df_messy['dictionary'] = df_messy.apply(
        lambda x: dict(zip(df_messy.columns, x.tolist())), axis=1)
    messy = dict(zip(df_messy.index, df_messy.dictionary))

    # Train or load the model
    deduper = _train(settings_file, training_file, canonical, messy,
                     common_name, sample_size, update_model, n_cores)

    # Cluster the records
    clustered_df = _cluster(deduper, canonical, messy, threshold, canonicalize)
    results = messy_data.join(clustered_df, how='left')
    results.rename(columns={
        'canonical_' + str(common_name):
        'canonical_' + str(field_properties)
    },
                   inplace=True)

    return results
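
# Usage sketch (illustrative, not from the original source): linking a messy
# column of organisation names to a one-column gazetteer. Column names and
# values are assumptions; note that field_properties is a single string here
# and that clean_data must contain exactly one column.
def _example_gazetteer_usage():
    import pandas as pd

    gazetteer = pd.DataFrame({'org': ['acme corporation', 'globex inc']})
    messy = pd.DataFrame({'org': ['ACME Corp.', 'Globex', 'Acme corporation']})
    # Each messy record is matched against the gazetteer; with
    # canonicalize=True the matched value is expected back in a
    # 'canonical_org' column (see the rename at the end of the function).
    return gazetteer_dataframe(gazetteer, messy, 'org',
                               canonicalize=True, threshold=0.3)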


# Example 3
def dedupe_dataframe(df,
                     field_properties,
                     canonicalize=False,
                     config_name="dedupe_dataframe",
                     update_model=False,
                     threshold=0.4,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.
        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : list
            A list specifying what fields to use for deduplicating records.
        canonicalize : bool or list, default False
            Option that provides the canonical records as additional columns.
            Specifying a list of column names only canonicalizes those columns.
        config_name : str, default dedupe_dataframe
            The configuration file name. Note that this will be used as 
            a prefix to save the settings and training files.
        update_model : bool, default False
            If True, allows the user to update an existing model by loading
            a previously saved training file.
        threshold : float, default 0.4
            Only put records together into clusters if the cophenetic
            similarity of the cluster is greater than the threshold.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.
    """
    # Import Data
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    df = clean_punctuation(df)

    specify_type(df, field_properties)

    df['dictionary'] = df.apply(lambda x: dict(zip(df.columns, x.tolist())),
                                axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    # Train or load the model
    deduper = _train(settings_file, training_file, data_d, field_properties,
                     sample_size, update_model)

    # Cluster the records
    clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
    results = df.join(clustered_df, how='left')
    results.drop(['dictionary'], axis=1, inplace=True)

    return results
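
# Usage sketch (illustrative, not from the original source): calling this
# variant of dedupe_dataframe with update_model=True, which per the docstring
# lets an existing model be updated from a previously saved training file.
# The input file, field list, and config_name are hypothetical.
def _example_update_model_usage():
    import pandas as pd

    contacts = pd.read_csv('contacts.csv')  # hypothetical input file
    return dedupe_dataframe(contacts, ['first_name', 'last_name', 'email'],
                            config_name='contacts_dedupe',
                            update_model=True, threshold=0.4)

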
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : list
            A list specifying what fields to use for deduplicating records.
        canonicalize : bool or list, default False
            Option that provides the canonical records as additional columns.
            Specifying a list of column names only canonicalizes those columns.
        config_name : str, default dedupe_dataframe
            The configuration file name. Note that this will be used as
            a prefix to save the settings and training files.
        recall_weight : int, default 1
            Find the threshold that will maximize a weighted average of our
            precision and recall.  When we set the recall weight to 2, we are
            saying we care twice as much about recall as we do precision.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.

    """
    # Import Data  
    config_name = config_name.replace(" ", "_")
   
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))
    
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(df) * sample_size)
        deduper.sample(data_d, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for i in data_d.values():
        for key in i:
            if i[key] is not None:
                i[key] = str(i[key])
            
    # ## Writing Results
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = list(cluster_membership.items())

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id'}, inplace=True)
    
    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    canonical_list = []
    
    if isinstance(canonicalize, list):
        # Canonicalize only the requested columns
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif canonicalize:
        # Canonicalize every attribute of the record
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id', inplace=True)
    df = df.join(dfa)            
    df.drop(columns=[1, 'dictionary'], inplace=True)

    return df
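
# Follow-up sketch (illustrative, not from the original source): inspecting
# the output of dedupe_dataframe above. The 0.7 confidence cutoff is an
# arbitrary assumption; 'cluster id' and 'confidence' are the columns added
# by the function.
def _example_inspect_clusters(deduped):
    # Keep only rows assigned to a cluster with reasonable confidence
    # (records outside any cluster have NaN confidence and are dropped).
    confident = deduped[deduped['confidence'].astype(float) > 0.7]
    # Size of each detected duplicate set, largest first.
    return (confident.groupby('cluster id')
                     .size()
                     .sort_values(ascending=False))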