# Module paths in these imports are assumed from the project layout.
from database import Database
from entityresolution import fast_strong_cluster


def main():
    path = '/home/scratch/trafficjam/rebuild/annotations.csv'
    header_path = '/home/scratch/trafficjam/entity_resolution_inputs/rebuild_annotations_header.csv'
    out = open('/home/scratch/trafficjam/entity_resolution_inputs/rebuild_phone_clusters_faster.csv', 'w')
    # NB: the header names the second column text_line, but the loop below
    # actually writes the poster id taken from features[0].
    out.write('annotations_line (0 indexed), text_line (1 indexed), cluster_id\n')
    database = Database(path, header_path=header_path)
    strong_clusters = fast_strong_cluster(database)
    line_indices = strong_clusters.keys()
    line_indices.sort()
    for line_index in line_indices:
        poster_id = database.records[line_index].features[0]
        if poster_id:
            (poster_id,) = poster_id  # unpack the single poster id from its feature set
        else:
            poster_id = ''
        cluster_id = strong_clusters[line_index]
        out.write(str(line_index) + ',' + str(poster_id) + ',' + str(cluster_id) + '\n')
    out.close()
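# A conceptual sketch (not this project's implementation) of what
# fast_strong_cluster computes above: union every pair of records that share
# a value of a strong feature (e.g. a phone number), then report the
# resulting connected components as a dict of record index -> cluster id.
# All names here are hypothetical.
def fast_strong_cluster_sketch(strong_values_by_record):
    # strong_values_by_record: dict of record index -> set of strong values
    parent = dict((index, index) for index in strong_values_by_record)

    def find(x):  # find the component root, with path halving
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    first_seen = {}  # strong value -> first record index seen with it
    for index, values in strong_values_by_record.items():
        for value in values:
            if value in first_seen:
                parent[find(index)] = find(first_seen[value])  # union
            else:
                first_seen[value] = index
    return dict((index, find(index)) for index in parent)

# Example: records 0 and 2 share a phone number, so they form one cluster:
# fast_strong_cluster_sketch({0: {'5551234'}, 1: {'5559999'}, 2: {'5551234'}})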
def main(): """ Runs a single entity resolution on data (real or synthetic) using a match function (logistic regression, decision tree, or random forest) """ data_type = 'real' decision_threshold = 0.7 train_class_balance = 0.5 max_block_size = 1000 cores = 2 if data_type == 'synthetic': database_train = SyntheticDatabase(100, 10, 10) corruption = 0.1 corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_train.database.feature_descriptor.number]) database_train.corrupt(corruption_array) database_validation = SyntheticDatabase(100, 10, 10) corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_validation.database.feature_descriptor.number]) database_validation.corrupt(corruption_array) database_test = SyntheticDatabase(10, 10, 10) corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_test.database.feature_descriptor.number]) database_test.corrupt(corruption_array) labels_train = database_train.labels labels_validation = database_validation.labels labels_test = database_test.labels database_train = database_train.database database_validation = database_validation.database database_test = database_test.database single_block = True elif data_type == 'real': # Uncomment to use all features (annotations and LM) #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') # Uncomment to only use annotation features #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') # Uncomment to only use LM features database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') labels_train = fast_strong_cluster(database_train) labels_validation = fast_strong_cluster(database_validation) labels_test = fast_strong_cluster(database_test) single_block = False else: Exception('Invalid experiment type'+data_type) entities = deepcopy(database_test) blocking_scheme = BlockingScheme(entities, max_block_size, single_block=single_block) train_seed = generate_pair_seed(database_train, labels_train, train_class_balance, require_direct_match=True, max_minor_class=5000) validation_seed = generate_pair_seed(database_validation, labels_validation, 0.5, require_direct_match=True, max_minor_class=5000) # forest_all = ForestMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # forest_all.test(database_all_validation, labels_validation, 
validation_seed) # tree_all = TreeMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # tree_all.test(database_all_validation, labels_validation, validation_seed) # logistic_all = LogisticMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # logistic_all.test(database_all_validation, labels_validation, validation_seed) forest_annotations = ForestMatchFunction(database_train, labels_train, train_seed, decision_threshold) roc = forest_annotations.test(database_validation, labels_validation, validation_seed) #roc.make_plot() #plt.show() # tree_annotations = TreeMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold) # tree_annotations.test(database_annotations_validation, labels_validation, validation_seed) # logistic_annotations = LogisticMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold) # logistic_annotations.test(database_annotations_validation, labels_validation, validation_seed) # forest_LM = ForestMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # forest_LM.test(database_LM_validation, labels_validation, validation_seed) # tree_LM = TreeMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # tree_LM.test(database_LM_validation, labels_validation, validation_seed) # logistic_LM = LogisticMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # logistic_LM.test(database_LM_validation, labels_validation, validation_seed) # forest_all.roc.write_rates('match_forest_all.csv') # tree_all.roc.write_rates('match_tree_all.csv') # logistic_all.roc.write_rates('match_logistic_all.csv') # # forest_annotations.roc.write_rates('match_forest_annotations.csv') # tree_annotations.roc.write_rates('match_tree_annotations.csv') # logistic_annotations.roc.write_rates('match_logistic_annotations.csv') # # forest_LM.roc.write_rates('match_forest_LM.csv') # tree_LM.roc.write_rates('match_tree_LM.csv') # logistic_LM.roc.write_rates('match_logistic_LM.csv') # ax = forest_all.roc.make_plot() # _ = tree_all.roc.make_plot(ax=ax) # _ = logistic_all.roc.make_plot(ax=ax) # plt.show() #forest_annotations.roc.make_plot() #plt.show() #entities.merge(strong_labels) #er = EntityResolution() #weak_labels = er.run(entities, match_function, blocking_scheme, cores=cores) weak_labels = weak_connected_components(database_test, forest_annotations, blocking_scheme) entities.merge(weak_labels) #strong_labels = fast_strong_cluster(entities) #entities.merge(strong_labels) # out = open('ER.csv', 'w') # out.write('phone,cluster_id\n') # for cluster_counter, (entity_id, entity) in enumerate(entities.records.iteritems()): # phone_index = 21 # for phone in entity.features[phone_index]: # out.write(str(phone)+','+str(cluster_counter)+'\n') # out.close() print 'Metrics using strong features as surrogate label. Entity resolution run using weak and strong features' metrics = Metrics(labels_test, weak_labels) # estimated_test_class_balance = count_pairwise_class_balance(labels_test) # new_metrics = NewMetrics(database_all_test, weak_labels, forest_all, estimated_test_class_balance) metrics.display()
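# A conceptual sketch (not this project's implementation) of the
# weak_connected_components step above: within each block, score every pair
# with the trained match function and take connected components of the pairs
# that match. It assumes match(a, b) returns True when the pair scores above
# the decision threshold; all names here are hypothetical.
from itertools import combinations

def weak_connected_components_sketch(records, match, blocks):
    # records: dict of record index -> record
    # blocks: iterable of lists of record indices (the blocking scheme)
    adjacency = dict((index, set()) for index in records)
    for block in blocks:
        for i, j in combinations(block, 2):
            if match(records[i], records[j]):
                adjacency[i].add(j)
                adjacency[j].add(i)
    labels = {}  # record index -> cluster id
    for root in records:
        if root in labels:
            continue
        labels[root] = root
        stack = [root]
        while stack:  # iterative depth-first flood fill of one component
            current = stack.pop()
            for neighbor in adjacency[current]:
                if neighbor not in labels:
                    labels[neighbor] = root
                    stack.append(neighbor)
    return labels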
# Module paths in these imports are assumed from the project layout.
import numpy as np
from database import Database, SyntheticDatabase
from entityresolution import fast_strong_cluster
from experiment import Experiment


def experiment_wrapper(dataset_name):
    """
    Experiment wrapper: takes only the dataset name; all experiment parameters are set here.
    :param dataset_name: Name of the dataset to run on, one of 'synthetic', 'restaurant', 'abt-buy', or 'trafficking'
    """
    if dataset_name == 'synthetic':
        number_entities = 100
        records_per_entity = 10
        train_database_size = 200
        train_class_balance = 0.5
        validation_database_size = 200
        corruption = 0.001  #0.025
        number_thresholds = 30
        number_features = 10

        synthetic_database = SyntheticDatabase(number_entities, records_per_entity,
                                               number_features=number_features)
        corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0,
                                                       size=[validation_database_size, synthetic_database.database.feature_descriptor.number])
        synthetic_database.corrupt(corruption_array)
        synthetic_train = synthetic_database.sample_and_remove(train_database_size)
        synthetic_validation = synthetic_database.sample_and_remove(validation_database_size)
        synthetic_test = synthetic_database
        thresholds = np.linspace(0, 1, number_thresholds)

        experiment = Experiment(synthetic_train.database, synthetic_validation.database,
                                synthetic_test.database, synthetic_train.labels,
                                synthetic_validation.labels, synthetic_test.labels,
                                train_class_balance, thresholds)
        experiment.plot()
    else:
        number_thresholds = 5
        if dataset_name == 'restaurant':  # 864 records, 112 matches
            features_path = '../data/restaurant/merged.csv'
            labels_path = '../data/restaurant/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 200
            database = Database(annotation_path=features_path)
        elif dataset_name == 'abt-buy':  # ~4900 records, 1300 matches
            features_path = '../data/Abt-Buy/merged.csv'
            labels_path = '../data/Abt-Buy/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 300
            database = Database(annotation_path=features_path)
        elif dataset_name == 'trafficking':
            features_path = '../data/trafficking/features.csv'
            labels_path = '../data/trafficking/labels.csv'
            train_database_size = 300
            train_class_balance = 0.5
            validation_database_size = 300
            #database = Database(annotation_path=features_path)
        else:
            raise Exception('Invalid dataset name')
        thresholds = np.linspace(0, 1, number_thresholds)

        # labels = np.loadtxt(open(labels_path, 'rb'))
        # database_train = database.sample_and_remove(train_database_size)
        # database_validation = database.sample_and_remove(validation_database_size)
        # database_test = database
        # labels_train = dict()
        # labels_validation = dict()
        # labels_test = dict()
        # for identifier, label in enumerate(labels):
        #     if identifier in database_train.records:
        #         labels_train[identifier] = label
        #     elif identifier in database_validation.records:
        #         labels_validation[identifier] = label
        #     elif identifier in database_test.records:
        #         labels_test[identifier] = label
        #     else:
        #         raise Exception('Record identifier ' + str(identifier) + ' not in either database')

        ###
        database_train = Database('../data/trafficking/cluster_subsample0_10000.csv',
                                  header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                  max_records=5000)
        database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv',
                                       header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                       max_records=5000)
        database_test = Database('../data/trafficking/cluster_subsample2_10000.csv',
                                 header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                 max_records=1000)
        labels_train = fast_strong_cluster(database_train)
        labels_validation = fast_strong_cluster(database_validation)
        labels_test = fast_strong_cluster(database_test)
        ###

        experiment = Experiment(database_train, database_validation, database_test,
                                labels_train, labels_validation, labels_test,
                                train_class_balance, thresholds)
        #print 'Saving results'
        #pickle.dump(experiment, open('experiment.p', 'wb'))
        experiment.plot()
    print 'Finished'
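# Hypothetical entry point, assuming the script is run directly; in practice
# the dataset name might come from sys.argv instead of being hard-coded.
if __name__ == '__main__':
    experiment_wrapper('trafficking')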