census_entity_id_exact = comparison.FieldComparatorExactString(
    desc='entity_id_exact')
census_surname_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='surname_winkler')
census_given_name_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='given_name_winkler')
census_suburb_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='suburb_winkler')

census_fc_list = [(census_entity_id_exact, 'entity_id', 'entity_id'),
                  (census_surname_winkler, 'surname', 'surname'),
                  (census_given_name_winkler, 'given_name', 'given_name'),
                  (census_suburb_winkler, 'suburb', 'suburb')]

census_rec_comp = comparison.RecordComparator(census_ds_A, census_ds_B,
                                              census_fc_list,
                                              'Census record comparator')

# Function to be used to check for true matches and non-matches
#
def census_check_funct(rec1, rec2):
  return (rec1[1] == rec2[1])

# Function to be used to extract the record identifier from a raw record
#
def census_get_id_funct(rec):
  return rec[1]

# Insert into data set dictionary
#
experiment_dict['census'] = ['Census', census_ds_A, census_ds_B,
                             census_rec_comp, census_check_funct,
                             census_get_id_funct]
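
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): one way the two
# helper functions above can be used together when evaluating linkage
# results. 'matched_rec_pair_list' is a hypothetical placeholder for a list
# of raw record pairs that a classifier has flagged as matches; the helpers
# themselves are taken verbatim from above.
#
# def census_count_true_matches(matched_rec_pair_list):
#   true_match_count = 0
#   for (rec1, rec2) in matched_rec_pair_list:
#     # A pair is a true match if both records carry the same entity
#     # identifier (field 1), as defined by census_check_funct
#     if (census_check_funct(rec1, rec2)):
#       true_match_count += 1
#   return true_match_count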
fc_funct_4 = comparison.FieldComparatorWinkler(
    agree_weight=1.0,
    description="Winkler-Description-Description",
    disagree_weight=0.0,
    missing_weight=0.0,
    threshold=0.0,
    check_sim=True,
    check_init=True,
    check_long=True)

field_comp_list = [(fc_funct_1, "Subject", "Subject"),
                   (fc_funct_2, "Creator", "Creator"),
                   (fc_funct_3, "Title", "Title"),
                   (fc_funct_4, "Description", "Description")]

rec_comp = comparison.RecordComparator(data_set_a, data_set_a,
                                       field_comp_list)

# -----------------------------------------------------------------------------
# Define indices for "blocking"
#
index = indexing.FullIndex(
    dataset1=data_set_a,
    dataset2=data_set_a,
    weight_vec_file="/home/jclark/projects/dpla_appfest/match_weights",
    progress_report=1,
    rec_comparator=rec_comp,
    index_sep_str="",
    skip_missing=True,
    index_def=[])

# Build and compact index
#
index.build()
index.compact()
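
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): a FullIndex does no
# actual blocking, so every record pair is compared and the run is only
# feasible for small data sets. Assuming the index above has been built and
# compacted, its run() method returns the field names and a dictionary of
# weight vectors, which a classifier such as classification.FellegiSunter
# (used elsewhere in this repository) can split into match, non-match and
# possible-match sets. The 0.99 thresholds are placeholder values copied
# from the experiment script below.
#
# [field_names_list, w_vec_dict] = index.run()
#
# classifier = classification.FellegiSunter(lower_threshold=0.99,
#                                           upper_threshold=0.99)
# classifier.train(w_vec_dict, set(), set())
# [m_set, nm_set, pm_set] = classifier.classify(w_vec_dict)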
import logging
import os
import time

import classification
import comparison
import dataset
import output

import myutil


def run():

  for ds in ['fl', 'nc']:

    if (ds == 'fl'):
      ds_dir = fl_ds_dir
      ds_keys_list = fl_keys_list
      ds_field_list = fl_field_list
    elif (ds == 'nc'):
      ds_dir = nc_ds_dir
      ds_keys_list = nc_keys_list
      ds_field_list = nc_field_list

    for corruption_percentage in corruption_percentage_list:
      for missing_percentage in missing_percentage_list:

        if (corruption_percentage == 5 and missing_percentage == 20):
          continue

        ds_path = os.path.join(
            ds_dir, '{}_missing_{}_corruption_{}.txt'.format(
                ds_size, missing_percentage, corruption_percentage))

        # Define input data set A
        #
        data_set_a = dataset.DataSetCSV(description="Data set",
                                        access_mode="read",
                                        strip_fields=True,
                                        miss_val=[''],
                                        rec_ident='r',
                                        file_name=ds_path,
                                        header_line=False,
                                        delimiter=",",
                                        field_list=ds_field_list)

        # ---------------------------------------------------------------------
        # Define field comparison functions
        #
        fc_funct_1 = comparison.FieldComparatorExactString(
            agree_weight=1.0, description="Str-Exact-field-17-field-17",
            disagree_weight=0.0, missing_weight=0.0)

        field_comp_list = [(fc_funct_1, "identifier", "identifier")]

        rec_comp = comparison.RecordComparator(data_set_a, data_set_a,
                                               field_comp_list)

        # ---------------------------------------------------------------------
        for keys_idx, keys in enumerate(ds_keys_list):
          for method_idx, method in enumerate(methods):
            # if (method_idx == 4):
            #   continue
            for is_tight in [True, False]:
              result_file_path = myutil.get_result_path(ds_path, method,
                                                        keys_idx, is_tight)
              result_file = open(result_file_path, 'a')

              for params_idx, params in enumerate(
                  get_params_list(method_idx, keys_idx, is_tight)):

                if (is_params_not_allowed(keys_idx, method_idx, is_tight,
                                          params_idx) or
                    myutil.is_result_already_stored(result_file_path,
                                                    params_idx)):
                  continue

                print(params)

                index_def = get_index_def(method_idx, keys, params,
                                          data_set_a, rec_comp)

                # Re-initialise the root logger so that each parameter
                # setting logs to its own file
                #
                for handler in logging.root.handlers[:]:
                  logging.root.removeHandler(handler)
                log_file_path = myutil.get_log_path(ds_path, method, keys_idx,
                                                    is_tight, params_idx)
                logging.basicConfig(filename=log_file_path, filemode='w',
                                    level=logging.INFO)

                # ---------------------------------------------------------------
                blocking_start_time = time.time()

                # Build and compact index
                #
                index_def.build()
                index_def.compact()

                # Do record pair comparisons
                #
                [field_names_list, w_vec_dict] = index_def.run()

                blocking_end_time = time.time()
                blocking_time = blocking_end_time - blocking_start_time

                # ---------------------------------------------------------------
                comparison_start_time = time.time()

                # Define weight vector (record pair) classifier
                #
                classifier = classification.FellegiSunter(
                    lower_threshold=0.99, upper_threshold=0.99)

                # Unsupervised training of classifier, using the original
                # weight vector dictionary
                #
                class_w_vec_dict = w_vec_dict

                classifier.train(class_w_vec_dict, set(), set())

                # Classify all weight vectors
                #
                [m_set, nm_set, pm_set] = classifier.classify(class_w_vec_dict)

                comparison_end_time = time.time()
                comparison_time = comparison_end_time - comparison_start_time

                # ---------------------------------------------------------------
                # Define output file options
                #
                histo_str_list = output.GenerateHistogram(class_w_vec_dict,
                                                          1.0)
                print(histo_str_list)

                (match_count, recall, reduction_ratio,
                 total_comparisons) = myutil.get_metrics(
                     ds_size, duplicate_percentage, histo_str_list)

                # for line in histo_str_list:
                #   print(line)

                match_file_path = myutil.get_matches_path(ds_path, method,
                                                          keys_idx, is_tight,
                                                          params_idx)
                output.SaveMatchStatusFile(class_w_vec_dict, m_set,
                                           match_file_path)

                print('{} {} {}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}'.format(
                    method, keys_idx, params_idx, match_count, recall,
                    reduction_ratio, total_comparisons, blocking_time,
                    comparison_time))
                result_file.write('{}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}\n'
                                  .format(params_idx, match_count, recall,
                                          reduction_ratio, total_comparisons,
                                          blocking_time, comparison_time))
                result_file.flush()
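
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): run() relies on
# module-level configuration defined elsewhere in this repository
# (fl_ds_dir, nc_ds_dir, the *_keys_list and *_field_list variables,
# corruption_percentage_list, missing_percentage_list, ds_size,
# duplicate_percentage, methods, get_params_list, is_params_not_allowed and
# get_index_def). A minimal, hypothetical configuration and entry point
# might look as follows; every value is a placeholder.
#
# corruption_percentage_list = [0, 2, 5]
# missing_percentage_list = [0, 10, 20]
# ds_size = 10000
# duplicate_percentage = 10
# methods = ['standard_blocking', 'sorted_neighbourhood', 'canopy_clustering']
#
# if __name__ == '__main__':
#   run()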