import logging

from dedupe import RecordLink, StaticRecordLink, trainingDataLink

log = logging.getLogger(__name__)


def init_linker(data_fields, training_data_file, training_settings_file, doc_data, per_data, sample_size, training_size):
    if training_settings_file:
        try:
            with open(training_settings_file, 'rb') as f:
                linker = StaticRecordLink(f)
                log.info('Read settings from {}'.format(training_settings_file))
                return linker
        except FileNotFoundError:
            pass

    linker = RecordLink(data_fields)

    if training_data_file:
        try:
            with open(training_data_file) as f:
                linker.readTraining(f)
                log.info('Read training data from {}'.format(training_data_file))
        except FileNotFoundError:
            pass

    # sample() draws candidate record pairs from the two data sets, markPairs()
    # labels them using the shared 'person' key, and train() fits the matching
    # model and blocking rules from those labelled examples.
    log.info('Generating training data')
    linker.sample(doc_data, per_data, sample_size=sample_size)
    linker.markPairs(trainingDataLink(doc_data, per_data, common_key='person', training_size=training_size))
    linker.train()

    if training_data_file:
        log.info('Writing training data to {}'.format(training_data_file))
        with open(training_data_file, 'w+') as fp:
            linker.writeTraining(fp)

    if training_settings_file:
        log.info('Writing settings data to {}'.format(training_settings_file))
        with open(training_settings_file, 'wb+') as fp:
            linker.writeSettings(fp)

    return linker
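
# A minimal usage sketch (not from the original project): the field list, file
# names, and sizes below are hypothetical stand-ins, and doc_data / per_data
# are assumed to be dicts mapping record ids to records, as dedupe expects.
data_fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]
linker = init_linker(data_fields,
                     training_data_file='training.json',
                     training_settings_file='learned_settings',
                     doc_data=doc_data, per_data=per_data,
                     sample_size=15000, training_size=5000)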
Example #2
            except ValueError:
                clean_row['price'] = 0
            data_d[filename + str(i)] = dict(clean_row)

    return data_d

    
print('importing data ...')
data_1 = readData('AbtBuy_Abt.csv')
data_2 = readData('AbtBuy_Buy.csv')

# These data have already been linked by hand, and linked records share the
# value of the field 'unique_id'. We are exploiting this work to create
# training data. This function also assumes that all pairs of records that
# DO NOT share a 'unique_id' value are distinct.
training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)
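
# trainingDataLink returns a dict with 'match' and 'distinct' keys, each a
# list of record pairs, which is exactly what markPairs() consumes. A quick
# sanity check on the labelled sample:
print(len(training_pairs['match']), 'matched pairs')
print(len(training_pairs['distinct']), 'distinct pairs')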


# ## Training

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        linker = dedupe.StaticRecordLink(f)

else:
    # Define the fields the linker will pay attention to
    #
    # Notice how we are telling the linker to use a custom field comparator
    # for the 'Zip' field. 
    fields = {
        'title': {'type': 'String'},
Example #3
    print('found duplicates')
    print(len(found_dupes))

    # precision = 1 - false positives / found; recall = true positives / known
    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])
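# frozenset makes each pair order-insensitive, so a found pair compares equal
# to a known pair regardless of which record came first.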

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{
        'field': 'name',
        'type': 'String'
    }, {
Example #4
                      StructField("manufacturer", StringType(), True),
                      StructField("family", StringType(), True),
                      StructField("model", StringType(), True),
                    ]

    products = sqlContext.read.json(PRODUCTS_PATH, StructType(product_fields))\
                                .fillna({'family': ''}) # replace nulls in family fields

    products_df, products_dict = canonical_format(products, Product)
    listings_df, listings_dict = canonical_format(listings, Listing)

    with open(PRODUCTS_TRAINING_PATH) as f:
        products_training_dict = json.load(f)
    with open(LISTINGS_TRAINING_PATH) as f:
        listings_training_dict = json.load(f)
    # train model
    gazetteer.sample(products_dict, listings_dict, 10000)
    training_pairs = trainingDataLink(products_training_dict, listings_training_dict, 'labelled_id', 10)
    gazetteer.markPairs(training_pairs)
    gazetteer.train()

    # add products to the index of records to match against
    if not gazetteer.blocked_records:
        gazetteer.index(products_dict)

    alpha = gazetteer.threshold(listings_dict, recall_weight=.5)
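    # recall_weight=.5 tells threshold() we value recall half as much as
    # precision when picking the score cutoff alpha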


    # identify records that all refer to the same entity
    print('clustering...')
    clustered_dupes = gazetteer.match(listings_dict, threshold=alpha)
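    # each entry in clustered_dupes links a listing to its matched product id(s)
    # with a confidence score above the alpha cutoff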
    debug(clustered_dupes)