Python SyntheticDatabase.add Examples

Programming Language: Python

Namespace/Package Name: database

Method/Function: add

Examples at hotexamples.com: 2

Python SyntheticDatabase.add - 2 examples found. These are the top rated real world Python examples of database.SyntheticDatabase.add extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

SyntheticDatabase(9)

corrupt(4)

add(2)

sample_and_remove(2)

plot(1)

Example #1

Show file

 def test_synthetic_add(self):
     synthetic = SyntheticDatabase(10, 10, number_features=3)  # 100 records, 10 entities, 10 records each
     self.assertEqual(len(synthetic.database.records), 100)
     synthetic.add(5, 10)
     self.assertEqual(len(synthetic.database.records), 150)

Example #2

Show file

File: experiments.py Project: mbarnes1/entity_resolution

def synthetic_sizes():
    """
    Sizes experiment here
    """
    resolution = 88
    number_features = 10
    number_entities = np.linspace(10, 100, num=resolution)
    number_entities = number_entities.astype(int)
    records_per_entity = 10
    #train_database_size = 100
    train_class_balance = 0.5
    #validation_database_size = 100
    corruption_multiplier = .001

    databases = list()
    db = SyntheticDatabase(number_entities[0], records_per_entity, number_features=number_features)
    databases.append(deepcopy(db))
    add_entities = [x - number_entities[i - 1] for i, x in enumerate(number_entities)][1:]
    for add in add_entities:
        db.add(add, records_per_entity)
        databases.append(deepcopy(db))
    corruption = np.random.normal(loc=0.0, scale=1.0, size=[number_entities[-1]*records_per_entity, number_features])
    train = deepcopy(databases[0])
    validation = deepcopy(databases[0])
    train.corrupt(corruption_multiplier*np.random.normal(loc=0.0, scale=1.0, size=[len(train.database.records), number_features]))
    validation.corrupt(corruption_multiplier*np.random.normal(loc=0.0, scale=1.0, size=[len(train.database.records), number_features]))
    for db in databases:
        db.corrupt(corruption_multiplier*corruption[:len(db.database.records), :])
    er = EntityResolution()
    train_pair_seed = generate_pair_seed(train.database, train.labels, train_class_balance)
    weak_match_function = LogisticMatchFunction(train.database, train.labels, train_pair_seed, 0.5)
    ROC = weak_match_function.test(validation.database, validation.labels, 0.5)
    #ROC.make_plot()

    ## Optimize ER on small dataset
    thresholds = np.linspace(0, 1.0, 10)
    metrics_list = list()
    #new_metrics_list = list()
    pairwise_precision = list()
    pairwise_recall = list()
    pairwise_f1 = list()
    for threshold in thresholds:
        weak_match_function.set_decision_threshold(threshold)
        labels_pred = er.run(deepcopy(databases[0].database), weak_match_function, single_block=True,
                             max_block_size=np.Inf, cores=1)
        met = Metrics(databases[0].labels, labels_pred)
        metrics_list.append(met)
        pairwise_precision.append(met.pairwise_precision)
        pairwise_recall.append(met.pairwise_recall)
        pairwise_f1.append(met.pairwise_f1)
        #class_balance_test = get_pairwise_class_balance(databases[0].labels)
        #new_metrics_list.append(NewMetrics(databases[0].database, er, class_balance_test))
    plt.plot(thresholds, pairwise_precision, label='Precision')
    plt.plot(thresholds, pairwise_recall, label='Recall')
    plt.plot(thresholds, pairwise_f1, label='F1')
    plt.xlabel('Threshold')
    plt.legend()
    plt.ylabel('Score')
    plt.title('Optimizing ER on small dataset')
    #i = np.argmax(np.array(pairwise_f1))
    #small_optimal_threshold = thresholds[i]  # optimize this
    small_optimal_threshold = 0.6
    print 'Optimal small threshold set at =', small_optimal_threshold
    plt.show()

    ## Possible score by optimizing on larger dataset
    metrics_list = list()
    pairwise_precision = list()
    pairwise_recall = list()
    pairwise_f1 = list()
    thresholds_largedataset = np.linspace(0.6, 1.0, 8)
    precision_lower_bound = list()
    recall_lower_bound = list()
    f1_lower_bound = list()
    for threshold in thresholds_largedataset:
        weak_match_function.set_decision_threshold(threshold)
        labels_pred = er.run(deepcopy(databases[-1].database), weak_match_function, single_block=True,
                             max_block_size=np.Inf, cores=1)
        met = Metrics(databases[-1].labels, labels_pred)
        metrics_list.append(met)
        pairwise_precision.append(met.pairwise_precision)
        pairwise_recall.append(met.pairwise_recall)
        pairwise_f1.append(met.pairwise_f1)
        class_balance_test = count_pairwise_class_balance(databases[-1].labels)
        new_metric = NewMetrics(databases[-1].database, labels_pred, weak_match_function, class_balance_test)
        precision_lower_bound.append(new_metric.precision_lower_bound)
        recall_lower_bound.append(new_metric.recall_lower_bound)
        f1_lower_bound.append(new_metric.f1_lower_bound)
    plt.plot(thresholds_largedataset, pairwise_precision, label='Precision', color='r')
    plt.plot(thresholds_largedataset, pairwise_recall, label='Recall', color='b')
    plt.plot(thresholds_largedataset, pairwise_f1, label='F1', color='g')
    plt.plot(thresholds_largedataset, precision_lower_bound, label='Precision Bound', color='r', linestyle=':')
    plt.plot(thresholds_largedataset, recall_lower_bound, label='Recall Bound', color='b', linestyle=':')
    plt.plot(thresholds_largedataset, f1_lower_bound, label='F1 Bound', color='g', linestyle=':')
    i = np.argmax(np.array(f1_lower_bound))
    large_optimal_threshold = thresholds_largedataset[i]
    print 'Optimal large threshold automatically set at =', large_optimal_threshold
    print 'If not correct: debug.'
    plt.xlabel('Threshold')
    plt.legend()
    plt.ylabel('Score')
    plt.title('Optimizing ER on large dataset')
    plt.show()

    ## Run on all dataset sizes
    #new_metrics_list = list()
    database_sizes = list()
    small_pairwise_precision = list()
    small_pairwise_recall = list()
    small_pairwise_f1 = list()
    large_precision_bound = list()
    large_precision_bound_lower_ci = list()
    large_precision_bound_upper_ci = list()
    large_precision = list()
    large_recall_bound = list()
    large_recall_bound_lower_ci = list()
    large_recall_bound_upper_ci = list()
    large_recall = list()
    large_f1 = list()
    large_f1_bound = list()
    for db in databases:
        print 'Analyzing synthetic database with', len(db.database.records), 'records'
        database_sizes.append(len(db.database.records))
        weak_match_function.set_decision_threshold(small_optimal_threshold)
        labels_pred = er.run(db.database, weak_match_function, single_block=True, max_block_size=np.Inf, cores=1)
        met = Metrics(db.labels, labels_pred)
        small_pairwise_precision.append(met.pairwise_precision)
        small_pairwise_recall.append(met.pairwise_recall)
        small_pairwise_f1.append(met.pairwise_f1)
        weak_match_function.set_decision_threshold(large_optimal_threshold)
        labels_pred = er.run(db.database, weak_match_function, single_block=True, max_block_size=np.Inf, cores=1)
        met = Metrics(db.labels, labels_pred)
        large_precision.append(met.pairwise_precision)
        large_recall.append(met.pairwise_recall)
        large_f1.append(met.pairwise_f1)
        class_balance_test = count_pairwise_class_balance(db.labels)
        new_metric = NewMetrics(db.database, labels_pred, weak_match_function, class_balance_test)
        large_precision_bound.append(new_metric.precision_lower_bound)
        large_recall_bound.append(new_metric.recall_lower_bound)
        large_f1_bound.append(new_metric.f1_lower_bound)
        large_precision_bound_lower_ci.append(new_metric.precision_lower_bound_lower_ci)
        large_precision_bound_upper_ci.append(new_metric.precision_lower_bound_upper_ci)
        large_recall_bound_lower_ci.append(new_metric.recall_lower_bound_lower_ci)
        large_recall_bound_upper_ci.append(new_metric.recall_lower_bound_upper_ci)

    with open('synthetic_sizes_temp.csv', 'wb') as f:
        f.write('Database size, Precision (small opt), Recall (small opt), F1 (small opt), Precision (large opt), Precision bound (large opt), Lower CI, Upper CI, Recall (large opt), Recall bound (large opt), Lower CI, Upper CI, F1 (large opt), F1 bound (large opt)\n')
        writer = csv.writer(f)
        writer.writerows(izip(database_sizes, small_pairwise_precision, small_pairwise_recall, small_pairwise_f1, large_precision, large_precision_bound, large_precision_bound_lower_ci, large_precision_bound_upper_ci, large_recall, large_recall_bound, large_recall_bound_lower_ci, large_recall_bound_upper_ci, large_f1, large_f1_bound))
    f.close()
    plt.figure()
    plt.plot(database_sizes, pairwise_precision, label='Precision', color='#4477AA', linewidth=3)
    plt.plot(database_sizes, pairwise_recall, label='Recall', color='#CC6677', linewidth=3)
    #plt.plot(database_sizes, pairwise_f1, label='F1', color='#DDCC77', linewidth=2)
    plt.ylim([0, 1.05])
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    plt.legend(title='Pairwise:', loc='lower left')
    plt.xlabel('Number of Records')
    plt.ylabel('Pairwise Score')
    plt.title('Performance Degredation')
    plt.show()