def init_linker(data_fields, training_data_file, training_settings_file,
                doc_data, per_data, sample_size, training_size):
    # Reuse previously learned settings if they exist.
    if training_settings_file:
        try:
            with open(training_settings_file, 'rb') as f:
                linker = StaticRecordLink(f)
            log.info('Read settings from {}'.format(training_settings_file))
            return linker
        except FileNotFoundError:
            pass

    linker = RecordLink(data_fields)

    # Seed the linker with any previously labelled training pairs.
    if training_data_file:
        try:
            with open(training_data_file) as f:
                linker.readTraining(f)
            log.info('Read training data from {}'.format(training_data_file))
        except FileNotFoundError:
            pass

    log.info('Generating training data')
    linker.sample(doc_data, per_data, sample_size=sample_size)
    linker.markPairs(trainingDataLink(doc_data, per_data,
                                      common_key='person',
                                      training_size=training_size))
    linker.train()

    # Persist the labelled pairs and learned settings for later runs.
    if training_data_file:
        log.info('Writing training data to {}'.format(training_data_file))
        with open(training_data_file, 'w+') as fp:
            linker.writeTraining(fp)

    if training_settings_file:
        log.info('Writing settings data to {}'.format(training_settings_file))
        with open(training_settings_file, 'wb+') as fp:
            linker.writeSettings(fp)

    return linker
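# A minimal usage sketch for init_linker above, assuming the dedupe 1.x
# API (RecordLink, StaticRecordLink, trainingDataLink). The imports, field
# definitions, file paths, and the tiny doc_data/per_data dicts are
# illustrative assumptions, not part of the original.
import logging

from dedupe import RecordLink, StaticRecordLink, trainingDataLink

log = logging.getLogger(__name__)

data_fields = [{'field': 'name', 'type': 'String'},
               {'field': 'person', 'type': 'Exact'}]

# Toy record dicts keyed by record id; real data would be much larger.
doc_data = {'doc1': {'name': 'A. Smith', 'person': 'p1'}}
per_data = {'per1': {'name': 'Alice Smith', 'person': 'p1'}}

linker = init_linker(data_fields,
                     training_data_file='training.json',     # hypothetical path
                     training_settings_file='settings.bin',  # hypothetical path
                     doc_data=doc_data, per_data=per_data,
                     sample_size=15000, training_size=5000)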
        except ValueError:
            clean_row['price'] = 0
        data_d[filename + str(i)] = dict(clean_row)

    return data_d


print('importing data ...')
data_1 = readData('AbtBuy_Abt.csv')
data_2 = readData('AbtBuy_Buy.csv')

# These data have already been linked by hand, and linked records
# share the value of the field 'unique_id'. We are exploiting this work
# to create training data. This function also assumes that all pairs of
# records that DO NOT share a 'unique_id' value are distinct.
training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

# ## Training

if os.path.exists(settings_file):
    print('reading from', settings_file)
    linker = dedupe.StaticRecordLink(settings_file)
else:
    # Define the fields the linker will pay attention to.
    #
    # Notice how we are telling the linker to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'title': {'type': 'String'},
    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{'field': 'name',
               'type': 'String'},
              {
StructField("manufacturer", StringType(), True), StructField("family", StringType(), True), StructField("model", StringType(), True), ] products = sqlContext.read.json(PRODUCTS_PATH, StructType(product_fields))\ .fillna({'family': ''}) # replace nulls in family fields products_df, products_dict = canonical_format(products, Product) listings_df, listings_dict = canonical_format(listings, Listing) products_training_dict = json.load(open(PRODUCTS_TRAINING_PATH)) listings_training_dict = json.load(open(LISTINGS_TRAINING_PATH)) # train model gazetteer.sample(products_dict, listings_dict, 10000) training_pairs = trainingDataLink(products_training_dict, listings_training_dict, 'labelled_id', 10) gazetteer.markPairs(training_pairs) gazetteer.train() # add products to the index of records to match against if not gazetteer.blocked_records: gazetteer.index(products_dict) alpha = gazetteer.threshold(listings_dict, recall_weight=.5) # identify records that all refer to the same entity print('clustering...') clustered_dupes = gazetteer.match(listings_dict, threshold=alpha) debug(clustered_dupes)