def _train_gazetteer(
    data_1,
    data_2,
    fields=DEDUPE_FIELDS,
    training_file=None,
    manual_labelling=False,
    pretty_print=False,
):
    LOGGER.info("training gazetteer with fields: %r", fields)

    gazetteer = dedupe.Gazetteer(fields)

    if training_file and smart_exists(training_file):
        LOGGER.info("reading existing training from <%s>", training_file)
        with open(training_file) as file_obj:
            gazetteer.prepare_training(
                data_1=data_1,
                data_2=data_2,
                training_file=file_obj,
                sample_size=50_000,
            )
    else:
        gazetteer.prepare_training(data_1=data_1, data_2=data_2, sample_size=50_000)

    if manual_labelling:
        LOGGER.info("start interactive labelling")
        dedupe.convenience.console_label(gazetteer)

    if training_file:
        LOGGER.info("write training data back to <%s>", training_file)
        with open(training_file, "w") as file_obj:
            # bug in dedupe preventing training from being serialized correctly
            # gazetteer.write_training(file_obj)
            if pretty_print:
                _write_training(gazetteer, file_obj, sort_keys=True, indent=4)
            else:
                _write_training(gazetteer, file_obj)
        # if pretty_print:
        #     with open(training_file) as file_obj:
        #         training = parse_json(file_obj)
        #     with open(training_file, "w") as file_obj:
        #         serialize_json(obj=training, file=file_obj, sort_keys=True, indent=4)

    LOGGER.info("done labelling, begin training")
    gazetteer.train(recall=0.9, index_predicates=True)
    gazetteer.cleanup_training()

    return gazetteer
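# Hypothetical example of the DEDUPE_FIELDS constant the signature above
# defaults to; the real definition lives elsewhere in the module. Each
# entry follows dedupe's variable-definition format.
EXAMPLE_DEDUPE_FIELDS = [
    {"field": "name", "type": "String"},
    {"field": "year", "type": "Exact", "has missing": True},
]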
def train_gazetteer(messy, canonical, model_settings=None, should_index=False):
    """
    Train and return a dedupe.Gazetteer using the specified messy and
    canonical dictionaries. The messy and canonical objects should have the
    same structure:
      - The key is a unique ID
      - The value is another dictionary of field:value pairs. This dictionary
        must contain at least 'country', 'name', and 'address' keys.

    Reads a training.json file containing positive and negative matches.
    """
    if model_settings:
        gazetteer = dedupe.StaticGazetteer(model_settings)
    else:
        fields = [
            {'field': 'country', 'type': 'Exact'},
            {'field': 'name', 'type': 'String'},
            {'field': 'address', 'type': 'String'},
        ]

        gazetteer = dedupe.Gazetteer(fields)
        gazetteer.sample(messy, canonical, 15000)
        training_file = os.path.join(settings.BASE_DIR, 'api', 'data',
                                     'training.json')
        with open(training_file) as tf:
            gazetteer.readTraining(tf)
        gazetteer.train()
        gazetteer.cleanupTraining()

    if should_index:
        index_start = datetime.now()
        logger.info('Indexing started')
        gazetteer.index(canonical)
        index_duration = datetime.now() - index_start
        logger.info('Indexing finished ({})'.format(index_duration))
        logger.info('Cleanup training')

    return gazetteer
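# A minimal usage sketch for train_gazetteer above, assuming the dedupe 1.x
# API used throughout (sample/readTraining/match). The toy records follow
# the structure the docstring requires: a unique ID mapping to a dict with
# at least 'country', 'name', and 'address'.
messy = {
    'm1': {'country': 'US', 'name': 'ACME Corp', 'address': '123 Main St'},
}
canonical = {
    'c1': {'country': 'US', 'name': 'Acme Corporation',
           'address': '123 Main Street'},
}

gazetteer = train_gazetteer(messy, canonical, should_index=True)

# With the canonical records indexed, each messy record can be matched
# against its best canonical candidate.
results = gazetteer.match(messy, threshold=0.5, n_matches=1)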
def _train_gazetteer(
    data_1,
    data_2,
    fields=DEDUPE_FIELDS,
    training_file=None,
    manual_labelling=False,
    pretty_print=False,
):
    LOGGER.info("training gazetteer with fields: %r", fields)

    gazetteer = dedupe.Gazetteer(fields)
    gazetteer.sample(data_1, data_2, 50_000)

    if training_file and smart_exists(training_file):
        LOGGER.info("reading existing training from <%s>", training_file)
        with smart_open(training_file, "r") as file_obj:
            gazetteer.readTraining(file_obj)

    if manual_labelling:
        LOGGER.info("start interactive labelling")
        dedupe.convenience.consoleLabel(gazetteer)

    if training_file:
        LOGGER.info("write training data back to <%s>", training_file)
        with smart_open(training_file, "w") as file_obj:
            gazetteer.writeTraining(file_obj)

        if pretty_print:
            with smart_open(training_file, "r") as file_obj:
                training = parse_json(file_obj)
            with smart_open(training_file, "w") as file_obj:
                serialize_json(obj=training, file=file_obj, sort_keys=True, indent=4)

    LOGGER.info("done labelling, begin training")
    gazetteer.train(recall=0.9, index_predicates=True)
    gazetteer.cleanupTraining()

    return gazetteer
fields = [{'field': 'title',
           'type': 'Text',
           'corpus': descriptions()},
          {'field': 'description',
           'type': 'Text',
           'has missing': True,
           'corpus': descriptions()},
          {'field': 'price',
           'type': 'Price',
           'has missing': True}]

# Create a new gazetteer object and pass our data model to it.
gazetteer = dedupe.Gazetteer(fields)

# If we have training data saved from a previous run of gazetteer,
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        gazetteer.prepare_training(messy, canonical, training_file=tf)
else:
    gazetteer.prepare_training(messy, canonical)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
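# A sketch of the labelling and training steps the comment above leads
# into, using the dedupe 2.x snake_case helpers that match the
# prepare_training call:
dedupe.console_label(gazetteer)  # press y/n/u to label, f when finished
gazetteer.train()

# Persist the labelled pairs so the os.path.exists(training_file) branch
# above can reuse them on the next run.
with open(training_file, 'w') as tf:
    gazetteer.write_training(tf)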
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        gazetteer = dedupe.StaticGazetteer(sf)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'address', 'type': 'String', 'has missing': True},
              {'field': 'city', 'type': 'String', 'has missing': True},
              {'field': 'state', 'type': 'Exact', 'has missing': True},
              {'field': 'zip', 'type': 'ShortString', 'has missing': True},
              {'field': 'phone', 'type': 'ShortString', 'has missing': True},
              {'field': 'country', 'type': 'Exact', 'has missing': True}]

    # Create a new gazetteer object and pass our data model to it.
    gazetteer = dedupe.Gazetteer(fields, num_cores=2)

    gazetteer.markPairs(labeled_examples)
    print("labelled pairs are loaded from database")

    # Prepare for training using sample data
    gazetteer.prepare_training(messy_data, canonical_data, blocked_proportion=0.6)
    # gazetteer.sample(messy_erp, canonical_sfdc, 15000)
    # deduper.prepare_training(temp_d, training_file=tf)
    # del messy_erp

    # ## Active learning
    dedupe.consoleLabel(gazetteer)

    gazetteer.train(index_predicates=True)

    # When finished, save our training away to disk
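    # A sketch of the save step the trailing comment refers to; the
    # training-file name is illustrative (only settings_file is defined in
    # this snippet), and the camelCase calls match the dedupe version used
    # above:
    with open('gazetteer_training.json', 'w') as tf:
        gazetteer.writeTraining(tf)

    with open(settings_file, 'wb') as sf:
        gazetteer.writeSettings(sf)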
def gazetteer_dataframes(messy_df, canonical_df, field_properties, recall_weight,
                         n_matches, config_name="gazetteer_dataframes"):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # ---------------------------------------------------------------------------------
    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)
    else:
        # Define the fields the gazetteer will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the gazetteer,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # Use the 'y', 'n' and 'u' keys to flag duplicates;
        # press 'f' when you are finished.
        print('starting active labeling...')

        dedupe.consoleLabel(gazetteer)

        gazetteer.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)

    # Calculate the matching threshold
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip(
        'messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df[
        'canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(
        int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)

    # results_df_copy was referenced below but never defined in the
    # original; a plain copy of results_df is the most plausible intent.
    results_df_copy = results_df.copy()

    # For both messy_df & canonical_df, add cluster id & confidence score
    # from results_df
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df_copy, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(
        columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df_copy,
                                      on='canonical_df_link', how='left')

    # Merge messy_df & canonical_df together
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
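# A hypothetical invocation of gazetteer_dataframes above; the column names
# and field_properties format are illustrative, not taken from the original
# module (clean_punctuation/specify_type/select_fields are assumed helpers).
import pandas as pd

messy = pd.DataFrame({'name': ['ACME Corp'],
                      'address': ['123 Main St']})
canonical = pd.DataFrame({'name': ['Acme Corporation'],
                          'address': ['123 Main Street']})

linked = gazetteer_dataframes(messy, canonical,
                              field_properties=['name', 'address'],
                              recall_weight=1,
                              n_matches=1)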
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)

    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'
                     .format(session_id))
        conn.execute('''
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR,
                record_id BIGINT
            )
        '''.format(session_id))

    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)

    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)

    # Disabling canopy based predicates for now. (The original popped items
    # while iterating, which skips elements; rebuilding the list removes
    # them all.)
    for definition in d.data_model.primary_fields:
        definition.predicates = [p for p in definition.predicates
                                 if p.type != 'TfidfPredicate']

    d.readTraining(StringIO(sess.training_data))
    d.train()

    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()

    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = '''
        SELECT p.record_id, {0}
        FROM "processed_{1}" AS p
        LEFT JOIN "exact_match_{1}" AS e
          ON p.record_id = e.match
        WHERE e.record_id IS NULL;
    '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:])))
            for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)

    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'
                         .format(session_id), s)
        conn.commit()
    except Exception as e:  # pragma: no cover
        conn.rollback()
        raise e
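# Companion sketch: reloading the settings blob saved into
# sess.gaz_settings_file above. The loader name is illustrative; the
# StringIO round-trip mirrors how the settings were written.
def loadGazetteer(sess):
    return dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))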
def _train(settings_file, training_file, clean_data, messy_data,
           field_properties, sample_size, update_model, n_cores):
    """Internal method that trains the deduper model from scratch or
    updates an existing dedupe model.

    Parameters
    ----------
    settings_file : str
        A path to a settings file that will be loaded if it exists.
    training_file : str
        A path to a training file that will be loaded to continue
        training from.
    clean_data : dict
        The dictionary form of the gazette that gazetteer_dedupe requires.
    messy_data : dict
        The dictionary form of the messy data that needs to be
        deduplicated (and canonicalized).
    field_properties : dict
        The mapping of fields to their respective data types. Please
        see the dedupe documentation for further details.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.
    update_model : bool, default False
        If True, allows the user to update an existing model by uploading
        a training file.
    n_cores : int, default None
        Specify the number of cores to use during clustering.
        By default n_cores is equal to None (i.e. use multiprocessing
        equal to CPU count).

    Returns
    -------
    dedupe.Gazetteer
        A gazetteer model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, [field_properties])

    if not update_model:
        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticGazetteer(f, num_cores=n_cores)
        else:
            # Create a new deduper object and pass our data model to it.
            deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

            # Launch active learning
            deduper = _active_learning(clean_data, messy_data, sample_size,
                                       deduper, training_file, settings_file)
    else:
        # ## Training

        # Initialise dedupe
        deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(clean_data, messy_data, training_file=f)

        # Launch active learning
        deduper = _active_learning(clean_data, messy_data, sample_size,
                                   deduper, training_file, settings_file)

    return deduper
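# A hypothetical call to _train above; the paths and field_properties are
# illustrative, and clean_data/messy_data are ID -> record dicts as the
# docstring describes.
deduper = _train(settings_file='gazetteer_learned_settings',
                 training_file='gazetteer_training.json',
                 clean_data=clean_data,
                 messy_data=messy_data,
                 field_properties=['name'],
                 sample_size=0.3,
                 update_model=False,
                 n_cores=None)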
def setUp(self):
    self.deduper = dedupe.Gazetteer([{'field': 'name', 'type': 'String'}])
    self.sSND = dedupe.training.semiSupervisedNonDuplicates
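# A minimal companion test, assuming the unittest.TestCase fixture built
# in setUp above (the assertion is illustrative):
def test_setup_creates_gazetteer(self):
    self.assertIsInstance(self.deduper, dedupe.Gazetteer)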