def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicate a dataframe on the given fields of interest.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to deduplicate.
    field_properties : list
        A list specifying what fields to use for deduplicating records.
    canonicalize : bool or list, default False
        Option that provides the canonical records as additional columns.
        Specifying a list of column names only canonicalizes those columns.
    config_name : str, default dedupe_dataframe
        The configuration file name. Note that this will be used as
        a prefix to save the settings and training files.
    recall_weight : int, default 1
        Find the threshold that will maximize a weighted average of our
        precision and recall. When we set the recall weight to 2, we are
        saying we care twice as much about recall as we do precision.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that contains the cluster id and confidence
        score. Optionally, it will contain canonicalized columns for all
        attributes of the record.
    """
    # Settings/training artifacts are named after the configuration prefix.
    config_name = config_name.replace(" ", "_")
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    # Represent every row as a {column: value} dict, keyed by its index,
    # which is the record format dedupe expects.
    df['dictionary'] = df.apply(
        lambda record: dict(zip(df.columns, record.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    # Train a new model or load a previously saved one.
    deduper = _train(settings_file, training_file, data_d, field_properties,
                     sample_size)

    # Pick the threshold maximizing the recall-weighted precision/recall mix.
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # Group records into clusters and attach the results to the input frame.
    clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
    results = df.join(clustered_df, how='left')
    results.drop(['dictionary'], axis=1, inplace=True)

    return results
def gazetteer_dataframe(clean_data, messy_data, field_properties,
                        canonicalize=False, config_name="gazetteer_dataframe",
                        update_model=False, threshold=0.3, sample_size=1,
                        n_cores=None):
    """Match records in a messy dataframe against a canonical gazetteer.

    Parameters
    ----------
    clean_data : pd.DataFrame
        The gazetteer dataframe (must consist of a single column).
    messy_data : pd.DataFrame
        The dataframe to deduplicate against the gazetteer.
    field_properties : str
        A string specifying what field to use for matching records.
    canonicalize : bool or list, default False
        Option that provides the canonical records as additional columns.
        Specifying a list of column names only canonicalizes those columns.
    config_name : str, default gazetteer_dataframe
        The configuration file name. Note that this will be used as a prefix
        to save the settings and training files.
    update_model : bool, default False
        If True, it allows user to update existing model by uploading
        training file.
    threshold : float, default 0.3
        Only consider put together records into clusters if the cophenetic
        similarity of the cluster is greater than the threshold.
    sample_size : float, default 1
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 100% (1) of our data.
    n_cores : int, default None
        Specify the number of cores to use during clustering.
        By default n_cores is equal to None (i.e. use multiprocessing equal
        to CPU count).

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that contains the cluster id and confidence score.
        Optionally, it will contain canonicalized columns for all attributes
        of the record.
    """
    # Settings/training artifacts are named after the configuration prefix.
    config_name = config_name.replace(" ", "_")
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses.
    assert isinstance(
        clean_data, pd.DataFrame
    ), 'Please provide a gazette in pandas dataframe format'
    assert len(
        clean_data.columns
    ) == 1, 'Please provide a gazetteer dataframe made of a single variable'
    assert isinstance(
        field_properties, str
    ), 'field_properties must be in string (str) format'

    # Both frames are renamed to the gazetteer's single column name so the
    # matching field lines up on each side.
    common_name = clean_data.columns[0]

    # Canonical dataset (i.e. gazette)
    df_canonical = clean_punctuation(clean_data)
    df_canonical.rename(columns={field_properties: common_name}, inplace=True)
    specify_type(df_canonical, [common_name])
    df_canonical['dictionary'] = df_canonical.apply(
        lambda x: dict(zip(df_canonical.columns, x.tolist())), axis=1)
    canonical = dict(zip(df_canonical.index, df_canonical.dictionary))

    # Messy dataset
    df_messy = clean_punctuation(messy_data)
    df_messy.rename(columns={field_properties: common_name}, inplace=True)
    specify_type(df_messy, [common_name])
    df_messy['dictionary'] = df_messy.apply(
        lambda x: dict(zip(df_messy.columns, x.tolist())), axis=1)
    messy = dict(zip(df_messy.index, df_messy.dictionary))

    # Train or load the model
    deduper = _train(settings_file, training_file, canonical, messy,
                     common_name, sample_size, update_model, n_cores)

    # Cluster the records
    clustered_df = _cluster(deduper, canonical, messy, threshold, canonicalize)
    results = messy_data.join(clustered_df, how='left')
    # Report the canonical column under the caller's original field name,
    # since the internal frames were renamed to the gazetteer column.
    results.rename(columns={
        'canonical_' + str(common_name): 'canonical_' + str(field_properties)
    }, inplace=True)

    return results
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", update_model=False,
                     threshold=0.4, sample_size=0.3):
    """Deduplicate a dataframe on the given fields of interest.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to deduplicate.
    field_properties : list
        A list specifying what fields to use for deduplicating records.
    canonicalize : bool or list, default False
        Option that provides the canonical records as additional columns.
        Specifying a list of column names only canonicalizes those columns.
    config_name : str, default dedupe_dataframe
        The configuration file name. Note that this will be used as
        a prefix to save the settings and training files.
    update_model : bool, default False
        If True, it allows user to update existing model by uploading
        training file.
    threshold : float, default 0.4
        Only put together records into clusters if the cophenetic similarity
        of the cluster is greater than the threshold.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that contains the cluster id and confidence
        score. Optionally, it will contain canonicalized columns for all
        attributes of the record.
    """
    # Settings/training artifacts are named after the configuration prefix.
    config_name = config_name.replace(" ", "_")
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    # Represent every row as a {column: value} dict, keyed by its index,
    # which is the record format dedupe expects.
    df['dictionary'] = df.apply(
        lambda record: dict(zip(df.columns, record.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    # Train a new model or load a previously saved one.
    deduper = _train(settings_file, training_file, data_d, field_properties,
                     sample_size, update_model)

    # Group records into clusters above the similarity threshold and attach
    # the results to the input frame.
    clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
    results = df.join(clustered_df, how='left')
    results.drop(['dictionary'], axis=1, inplace=True)

    return results
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicate a dataframe on the given fields of interest.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to deduplicate.
    field_properties : list
        A list specifying what fields to use for deduplicating records.
    canonicalize : bool or list, default False
        Option that provides the canonical records as additional columns.
        Specifying a list of column names only canonicalizes those columns.
    config_name : str, default dedupe_dataframe
        The configuration file name. Note that this will be used as a prefix
        to save the settings and training files.
    recall_weight : int, default 1
        Find the threshold that will maximize a weighted average of our
        precision and recall. When we set the recall weight to 2, we are
        saying we care twice as much about recall as we do precision.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that contains the cluster id and confidence
        score. Optionally, it will contain canonicalized columns for all
        attributes of the record.
    """
    # Import Data
    config_name = config_name.replace(" ", "_")
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    # Represent every row as a {column: value} dict, keyed by its index,
    # which is the record format dedupe expects.
    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    # If a settings file already exists, load it and skip training.
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # Define the fields dedupe will pay attention to.
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(df) * sample_size)
        deduper.sample(data_d, sample_num)

        # Reuse labeled examples saved from a previous run, if any.
        # Note: if you want to train from scratch, delete the training_file.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates.
        deduper.train()

        # Persist the training data and learned settings so the next run
        # can skip training entirely.
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    # Find the threshold maximizing the recall-weighted precision/recall mix.
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # Convert values to strings so Price & LatLong won't get a traceback
    # during dedupe.canonicalize().
    for record in data_d.values():
        for key in record:
            if record[key] is not None:
                record[key] = str(record[key])

    # Build per-record membership info: cluster id, the cluster's canonical
    # representation, and the match confidence.
    cluster_membership = {}
    for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    # Turn results into a dataframe keyed by record id.
    dfa = pd.DataFrame(list(cluster_membership.items()))
    dfa.rename(columns={0: 'Id'}, inplace=True)
    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    # BUG FIX: the list check must run before the truthiness check.
    # Previously `if canonicalize:` came first, so a non-empty list of
    # column names was truthy, took the boolean branch, and canonicalized
    # every column instead of only the requested ones.
    if isinstance(canonicalize, list):
        canonical_columns = canonicalize
    elif canonicalize:
        canonical_columns = list(dfa[1][0]['canonical representation'].keys())
    else:
        canonical_columns = []
    for col in canonical_columns:
        # Bind `col` as a default argument to avoid the late-binding
        # closure pitfall.
        dfa[col + ' - ' + 'canonical'] = dfa[1].apply(
            lambda x, col=col: x['canonical representation'][col])

    dfa.set_index('Id', inplace=True)
    df = df.join(dfa)
    df.drop(columns=[1, 'dictionary'], inplace=True)

    return df