import logging

from dedupe import RecordLink, StaticRecordLink, trainingDataLink

log = logging.getLogger(__name__)


def init_linker(data_fields, training_data_file, training_settings_file, doc_data, per_data, sample_size, training_size):
    if training_settings_file:
        try:
            with open(training_settings_file, 'rb') as f:
                linker = StaticRecordLink(f)
                log.info('Read settings from {}'.format(training_settings_file))
                return linker
        except FileNotFoundError:
            pass

    linker = RecordLink(data_fields)

    if training_data_file:
        try:
            with open(training_data_file) as f:
                linker.readTraining(f)
                log.info('Read training data from {}'.format(training_data_file))
        except FileNotFoundError:
            pass

    # sample() draws candidate record pairs from the two data sets, markPairs()
    # labels them using the shared 'person' key, and train() fits the matching
    # model and blocking rules from those labelled examples.
    log.info('Generating training data')
    linker.sample(doc_data, per_data, sample_size=sample_size)
    linker.markPairs(trainingDataLink(doc_data, per_data, common_key='person', training_size=training_size))
    linker.train()

    if training_data_file:
        log.info('Writing training data to {}'.format(training_data_file))
        with open(training_data_file, 'w+') as fp:
            linker.writeTraining(fp)

    if training_settings_file:
        log.info('Writing settings data to {}'.format(training_settings_file))
        with open(training_settings_file, 'wb+') as fp:
            linker.writeSettings(fp)

    return linker
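
# A minimal usage sketch (not from the original project): the field list, file
# names, and sizes below are hypothetical stand-ins, and doc_data / per_data
# are assumed to be dicts mapping record ids to records, as dedupe expects.
data_fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]
linker = init_linker(data_fields,
                     training_data_file='training.json',
                     training_settings_file='learned_settings',
                     doc_data=doc_data, per_data=per_data,
                     sample_size=15000, training_size=5000)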
Example #2
            except ValueError:
                clean_row['price'] = 0
            data_d[filename + str(i)] = dict(clean_row)

    return data_d

    
print('importing data ...')
data_1 = readData('AbtBuy_Abt.csv')
data_2 = readData('AbtBuy_Buy.csv')

# These data have already been linked by hand, and linked records share the
# value of the field 'unique_id'. We are exploiting this work to create
# training data. This function also assumes that all pairs of records that
# DO NOT share a 'unique_id' value are distinct.
training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)
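
# trainingDataLink returns a dict with 'match' and 'distinct' keys, each a
# list of record pairs, which is exactly what markPairs() consumes. A quick
# sanity check on the labelled sample:
print(len(training_pairs['match']), 'matched pairs')
print(len(training_pairs['distinct']), 'distinct pairs')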


# ## Training

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        linker = dedupe.StaticRecordLink(f)

else:
    # Define the fields the linker will pay attention to
    #
    # Notice how we are telling the linker to use a custom field comparator
    # for the 'Zip' field. 
    fields = {
        'title': {'type': 'String'},
Example #3
    print('found duplicates')
    print(len(found_dupes))

    # precision = 1 - false positives / found; recall = true positives / known
    print('precision')
    print(1 - len(false_positives) / float(len(found_dupes)))

    print('recall')
    print(len(true_positives) / float(len(true_dupes)))


settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])
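# frozenset makes each pair order-insensitive, so a found pair compares equal
# to a known pair regardless of which record came first.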

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{
        'field': 'name',
        'type': 'String'
    }, {
Example #4
                      StructField("manufacturer", StringType(), True),
                      StructField("family", StringType(), True),
                      StructField("model", StringType(), True),
                    ]

    products = sqlContext.read.json(PRODUCTS_PATH, StructType(product_fields))\
                                .fillna({'family': ''}) # replace nulls in family fields

    products_df, products_dict = canonical_format(products, Product)
    listings_df, listings_dict = canonical_format(listings, Listing)

    with open(PRODUCTS_TRAINING_PATH) as f:
        products_training_dict = json.load(f)
    with open(LISTINGS_TRAINING_PATH) as f:
        listings_training_dict = json.load(f)
    # train model
    gazetteer.sample(products_dict, listings_dict, 10000)
    training_pairs = trainingDataLink(products_training_dict, listings_training_dict, 'labelled_id', 10)
    gazetteer.markPairs(training_pairs)
    gazetteer.train()

    # add products to the index of records to match against
    if not gazetteer.blocked_records:
        gazetteer.index(products_dict)

    alpha = gazetteer.threshold(listings_dict, recall_weight=.5)
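    # recall_weight=.5 tells threshold() we value recall half as much as
    # precision when picking the score cutoff alpha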


    # identify records that all refer to the same entity
    print('clustering...')
    clustered_dupes = gazetteer.match(listings_dict, threshold=alpha)
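    # each entry in clustered_dupes links a listing to its matched product id(s)
    # with a confidence score above the alpha cutoff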
    debug(clustered_dupes)