Python SyntheticDatabase.add Exemples

Langage de programmation: Python

Espace de noms/Paquet : database

Méthode/Fonction: add

Exemples sur hotexamples.com : 2

Python SyntheticDatabase.add - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de database.SyntheticDatabase.add extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

SyntheticDatabase(9)

corrupt(4)

add(2)

sample_and_remove(2)

plot(1)

Méthodes fréquemment utilisées

SyntheticDatabase (9)

corrupt (4)

add (2)

sample_and_remove (2)

plot (1)

Exemple #1

0

Afficher le fichier

def test_synthetic_add(self):
    """Check that SyntheticDatabase.add grows the record count as expected.

    The database starts with 10 entities of 10 records each (100 records);
    adding 5 entities with 10 records each should bring it to 150 records.
    """
    # 100 records, 10 entities, 10 records each
    synthetic = SyntheticDatabase(10, 10, number_features=3)
    self.assertEqual(len(synthetic.database.records), 100)
    # The comment must sit on its own line: when it trailed the constructor
    # call on a single physical line, it swallowed the add() call and both
    # assertions, so the test verified nothing.
    synthetic.add(5, 10)
    self.assertEqual(len(synthetic.database.records), 150)

Exemple #2

0

Afficher le fichier

Fichier : experiments.py Projet : mbarnes1/entity_resolution

def synthetic_sizes():
    """
    Sizes experiment: how entity-resolution scores change with database size.

    Steps, in order:
      1. Grow one SyntheticDatabase from number_entities[0] to
         number_entities[-1] entities, keeping a deep copy at every size.
      2. Corrupt copies of the smallest database for training/validation and
         corrupt every snapshot with a shared noise matrix.
      3. Build a LogisticMatchFunction from the training copy.
      4. Sweep decision thresholds on the smallest database and plot the
         pairwise precision/recall/F1 (the "small" threshold itself is
         hard-coded to 0.6).
      5. Sweep thresholds on the largest database, plot scores and their lower
         bounds, and pick the "large" threshold maximizing the F1 lower bound.
      6. Run both thresholds on every snapshot, write the results to
         'synthetic_sizes_temp.csv', and plot precision/recall vs. size.

    Takes no arguments and returns None. Side effects: prints progress, opens
    three matplotlib windows via plt.show(), and writes the CSV file in the
    current working directory.
    """
    resolution = 88
    number_features = 10
    number_entities = np.linspace(10, 100, num=resolution).astype(int)
    records_per_entity = 10
    train_class_balance = 0.5
    corruption_multiplier = .001

    # Grow a single database step by step, snapshotting it at every size.
    db = SyntheticDatabase(number_entities[0], records_per_entity, number_features=number_features)
    databases = [deepcopy(db)]
    # np.diff yields the number of entities to add between consecutive sizes.
    for add in np.diff(number_entities):
        db.add(add, records_per_entity)
        databases.append(deepcopy(db))

    # One noise matrix sized for the largest database; each snapshot uses the
    # leading rows so smaller databases share noise with the larger ones.
    corruption = np.random.normal(loc=0.0, scale=1.0,
                                  size=[number_entities[-1]*records_per_entity, number_features])
    train = deepcopy(databases[0])
    validation = deepcopy(databases[0])
    train.corrupt(corruption_multiplier*np.random.normal(
        loc=0.0, scale=1.0, size=[len(train.database.records), number_features]))
    validation.corrupt(corruption_multiplier*np.random.normal(
        loc=0.0, scale=1.0, size=[len(train.database.records), number_features]))
    for db in databases:
        db.corrupt(corruption_multiplier*corruption[:len(db.database.records), :])

    er = EntityResolution()
    train_pair_seed = generate_pair_seed(train.database, train.labels, train_class_balance)
    weak_match_function = LogisticMatchFunction(train.database, train.labels, train_pair_seed, 0.5)
    # Result is not used below; kept so it can be inspected (e.g. ROC.make_plot()).
    ROC = weak_match_function.test(validation.database, validation.labels, 0.5)

    ## Optimize ER on small dataset
    thresholds = np.linspace(0, 1.0, 10)
    pairwise_precision = list()
    pairwise_recall = list()
    pairwise_f1 = list()
    for threshold in thresholds:
        weak_match_function.set_decision_threshold(threshold)
        labels_pred = er.run(deepcopy(databases[0].database), weak_match_function, single_block=True,
                             max_block_size=np.inf, cores=1)
        met = Metrics(databases[0].labels, labels_pred)
        pairwise_precision.append(met.pairwise_precision)
        pairwise_recall.append(met.pairwise_recall)
        pairwise_f1.append(met.pairwise_f1)
    plt.plot(thresholds, pairwise_precision, label='Precision')
    plt.plot(thresholds, pairwise_recall, label='Recall')
    plt.plot(thresholds, pairwise_f1, label='F1')
    plt.xlabel('Threshold')
    plt.legend()
    plt.ylabel('Score')
    plt.title('Optimizing ER on small dataset')
    # Hard-coded rather than taken from the sweep above (argmax of pairwise_f1
    # was left disabled by the original author).
    small_optimal_threshold = 0.6
    print('Optimal small threshold set at = %s' % small_optimal_threshold)
    plt.show()

    ## Possible score by optimizing on larger dataset
    pairwise_precision = list()
    pairwise_recall = list()
    pairwise_f1 = list()
    thresholds_largedataset = np.linspace(0.6, 1.0, 8)
    precision_lower_bound = list()
    recall_lower_bound = list()
    f1_lower_bound = list()
    for threshold in thresholds_largedataset:
        weak_match_function.set_decision_threshold(threshold)
        labels_pred = er.run(deepcopy(databases[-1].database), weak_match_function, single_block=True,
                             max_block_size=np.inf, cores=1)
        met = Metrics(databases[-1].labels, labels_pred)
        pairwise_precision.append(met.pairwise_precision)
        pairwise_recall.append(met.pairwise_recall)
        pairwise_f1.append(met.pairwise_f1)
        class_balance_test = count_pairwise_class_balance(databases[-1].labels)
        new_metric = NewMetrics(databases[-1].database, labels_pred, weak_match_function, class_balance_test)
        precision_lower_bound.append(new_metric.precision_lower_bound)
        recall_lower_bound.append(new_metric.recall_lower_bound)
        f1_lower_bound.append(new_metric.f1_lower_bound)
    plt.plot(thresholds_largedataset, pairwise_precision, label='Precision', color='r')
    plt.plot(thresholds_largedataset, pairwise_recall, label='Recall', color='b')
    plt.plot(thresholds_largedataset, pairwise_f1, label='F1', color='g')
    plt.plot(thresholds_largedataset, precision_lower_bound, label='Precision Bound', color='r', linestyle=':')
    plt.plot(thresholds_largedataset, recall_lower_bound, label='Recall Bound', color='b', linestyle=':')
    plt.plot(thresholds_largedataset, f1_lower_bound, label='F1 Bound', color='g', linestyle=':')
    # The "large" threshold is the one with the best F1 lower bound.
    i = np.argmax(np.array(f1_lower_bound))
    large_optimal_threshold = thresholds_largedataset[i]
    print('Optimal large threshold automatically set at = %s' % large_optimal_threshold)
    print('If not correct: debug.')
    plt.xlabel('Threshold')
    plt.legend()
    plt.ylabel('Score')
    plt.title('Optimizing ER on large dataset')
    plt.show()

    ## Run on all dataset sizes
    database_sizes = list()
    small_pairwise_precision = list()
    small_pairwise_recall = list()
    small_pairwise_f1 = list()
    large_precision_bound = list()
    large_precision_bound_lower_ci = list()
    large_precision_bound_upper_ci = list()
    large_precision = list()
    large_recall_bound = list()
    large_recall_bound_lower_ci = list()
    large_recall_bound_upper_ci = list()
    large_recall = list()
    large_f1 = list()
    large_f1_bound = list()
    for db in databases:
        print('Analyzing synthetic database with %s records' % len(db.database.records))
        database_sizes.append(len(db.database.records))

        # Scores with the threshold chosen for the small dataset.
        weak_match_function.set_decision_threshold(small_optimal_threshold)
        labels_pred = er.run(db.database, weak_match_function, single_block=True,
                             max_block_size=np.inf, cores=1)
        met = Metrics(db.labels, labels_pred)
        small_pairwise_precision.append(met.pairwise_precision)
        small_pairwise_recall.append(met.pairwise_recall)
        small_pairwise_f1.append(met.pairwise_f1)

        # Scores and lower bounds with the threshold chosen on the large dataset.
        weak_match_function.set_decision_threshold(large_optimal_threshold)
        labels_pred = er.run(db.database, weak_match_function, single_block=True,
                             max_block_size=np.inf, cores=1)
        met = Metrics(db.labels, labels_pred)
        large_precision.append(met.pairwise_precision)
        large_recall.append(met.pairwise_recall)
        large_f1.append(met.pairwise_f1)
        class_balance_test = count_pairwise_class_balance(db.labels)
        new_metric = NewMetrics(db.database, labels_pred, weak_match_function, class_balance_test)
        large_precision_bound.append(new_metric.precision_lower_bound)
        large_recall_bound.append(new_metric.recall_lower_bound)
        large_f1_bound.append(new_metric.f1_lower_bound)
        large_precision_bound_lower_ci.append(new_metric.precision_lower_bound_lower_ci)
        large_precision_bound_upper_ci.append(new_metric.precision_lower_bound_upper_ci)
        large_recall_bound_lower_ci.append(new_metric.recall_lower_bound_lower_ci)
        large_recall_bound_upper_ci.append(new_metric.recall_lower_bound_upper_ci)

    # The with-statement closes the file; no explicit close() is needed.
    # 'wb' is the mode the csv module expects on Python 2, which this file targets.
    with open('synthetic_sizes_temp.csv', 'wb') as f:
        f.write('Database size, Precision (small opt), Recall (small opt), F1 (small opt), Precision (large opt), Precision bound (large opt), Lower CI, Upper CI, Recall (large opt), Recall bound (large opt), Lower CI, Upper CI, F1 (large opt), F1 bound (large opt)\n')
        writer = csv.writer(f)
        writer.writerows(zip(database_sizes, small_pairwise_precision, small_pairwise_recall,
                             small_pairwise_f1, large_precision, large_precision_bound,
                             large_precision_bound_lower_ci, large_precision_bound_upper_ci,
                             large_recall, large_recall_bound, large_recall_bound_lower_ci,
                             large_recall_bound_upper_ci, large_f1, large_f1_bound))

    plt.figure()
    # Plot the per-database-size series. The threshold-sweep lists
    # (pairwise_precision / pairwise_recall) have one entry per threshold, not
    # per database, so plotting them against database_sizes fails on a length
    # mismatch. NOTE(review): the small-threshold series is assumed to be the
    # intended one given the figure title - confirm.
    plt.plot(database_sizes, small_pairwise_precision, label='Precision', color='#4477AA', linewidth=3)
    plt.plot(database_sizes, small_pairwise_recall, label='Recall', color='#CC6677', linewidth=3)
    plt.ylim([0, 1.05])
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    plt.legend(title='Pairwise:', loc='lower left')
    plt.xlabel('Number of Records')
    plt.ylabel('Pairwise Score')
    plt.title('Performance Degredation')
    plt.show()