Example no. 1
0
    print(len(found_dupes))

    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


# Evaluation setup: learned-settings path and the labelled restaurant dataset.
settings_file = 'canonical_learned_settings.json'
raw_data = 'tests/datasets/restaurant-nophone-training.csv'

# Load the canonical test data; `header` keeps the original column order.
data_d, header = canonicalImport(raw_data)

# Draw up to 5000 labelled pairs, using 'unique_id' as the ground-truth key.
training_pairs = dedupe.trainingDataDedupe(data_d, 'unique_id', 5000)

# Known duplicates as unordered pairs, so (a, b) and (b, a) compare equal.
duplicates_s = {frozenset(match_pair) for match_pair in training_pairs['match']}

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field' : 'name', 'type': 'String'},
              {'field' : 'name', 'type': 'Exact'},
              {'field' : 'address', 'type': 'String'},
Example no. 2
0
    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


# Evaluation setup: learned-settings path and the labelled restaurant dataset.
settings_file = 'canonical_learned_settings'
raw_data = 'tests/datasets/restaurant-nophone-training.csv'

# Load the canonical test data; `header` keeps the original column order.
data_d, header = canonicalImport(raw_data)

# Draw up to 5000 labelled pairs, using 'unique_id' as the ground-truth key.
training_pairs = dedupe.trainingDataDedupe(data_d, 'unique_id', 5000)

# Group record ids by their 'unique_id'; a group of exactly two records
# contributes one known duplicate pair (stored as an unordered frozenset).
groups = {}
for record_id, record in data_d.items():
    groups.setdefault(record['unique_id'], []).append(record_id)

duplicates = {frozenset(ids) for ids in groups.values() if len(ids) == 2}

t0 = time.time()

print('number of known duplicate pairs', len(duplicates))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        'type': 'String'
    }, {
        'field': 'geometry',
        'type': 'LatLong',
        'has missing': True
    }, {
        'field': 'country code',
        'type': 'Exact',
        'has missing': True
    }]
    commonField = 'geonameid'

    # Create labeled data
    trainingSize = int(0.8 * len(trainingData))
    labeledData = dedupe.trainingDataDedupe(trainingData,
                                            commonField,
                                            training_size=trainingSize)

    # Create the matcher
    logger.info('Train using the labeled data')
    matcher = dedupe.Dedupe(fields)
    sampleSize = int(0.2 * len(trainingData))
    matcher.sample(trainingData, sample_size=sampleSize)
    matcher.markPairs(labeledData)
    matcher.train()
    logger.info('Training finished')

    # When finished, save our training to disk
    trainingFile = arguments.train
    with open(trainingFile, 'w') as tf:
        matcher.writeTraining(tf)