import os
import time
from itertools import groupby

import dedupe


def evaluateDuplicates(found_dupes, true_dupes):
    # Compare the pairs the deduper found against the known true pairs.
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)

    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


settings_file = 'canonical_learned_settings.json'
raw_data = 'tests/datasets/restaurant-nophone-training.csv'

# canonicalImport (defined elsewhere in this script) loads the CSV into a
# dict of records keyed by record id.
data_d, header = canonicalImport(raw_data)

training_pairs = dedupe.trainingDataDedupe(data_d, 'unique_id', 5000)

# Build the ground-truth set of duplicate pairs: records that share a
# 'unique_id' refer to the same restaurant.
duplicates = set()
for _, pair in groupby(sorted(data_d.items(),
                              key=lambda x: x[1]['unique_id']),
                       key=lambda x: x[1]['unique_id']):
    pair = list(pair)
    if len(pair) == 2:
        a, b = pair
        duplicates.add(frozenset((a[0], b[0])))

t0 = time.time()

print('number of known duplicate pairs', len(duplicates))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, num_cores=1)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
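# --- Everything from here to the end of this 'else' branch is a hedged
# sketch, not the original script: the fields list above is truncated in
# the source, and the training steps below are assumptions based on the
# dedupe 1.x API (Dedupe, sample, markPairs, train, writeSettings).
              ]  # hypothetical close of the truncated fields list

    deduper = dedupe.Dedupe(fields, num_cores=1)
    deduper.sample(data_d, 10000)        # sample size is an assumption
    deduper.markPairs(training_pairs)    # feed in the labeled pairs from above
    deduper.train()

    # Save the learned settings so the StaticDedupe branch above can reuse them.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)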
# A second, separate example script: matching GeoNames records. The opening
# entries of this fields list are missing from the source.
           'type': 'String'},
          {'field': 'geometry',
           'type': 'LatLong',
           'has missing': True},
          {'field': 'country code',
           'type': 'Exact',
           'has missing': True}]

commonField = 'geonameid'

# Create labeled data
trainingSize = int(0.8 * len(trainingData))
labeledData = dedupe.trainingDataDedupe(trainingData,
                                        commonField,
                                        training_size=trainingSize)

# Create the matcher
logger.info('Train using the labeled data')
matcher = dedupe.Dedupe(fields)

sampleSize = int(0.2 * len(trainingData))
matcher.sample(trainingData, sample_size=sampleSize)
matcher.markPairs(labeledData)
matcher.train()
logger.info('Training finished')

# When finished, save our training to disk
trainingFile = arguments.train
with open(trainingFile, 'w') as tf:
    matcher.writeTraining(tf)
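# --- Hedged usage sketch (not in the original script): with training written
# out, the matcher can cluster the full dataset. This assumes the dedupe 1.x
# API (threshold/match); the recall_weight value is an arbitrary example.
from itertools import combinations

threshold = matcher.threshold(trainingData, recall_weight=1)
clustered = matcher.match(trainingData, threshold)

# Flatten clusters into record-id pairs, e.g. to feed an evaluation like
# evaluateDuplicates() in the first script above.
found_pairs = set(frozenset(pair)
                  for ids, _scores in clustered
                  for pair in combinations(ids, 2))
print('found pairs:', len(found_pairs))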