# Unsupervised training of classifier
#
class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

classifier.train(class_w_vec_dict, set(), set())

# Classify all weight vectors
#
[m_set, nm_set, pm_set] = classifier.classify(class_w_vec_dict)

# -----------------------------------------------------------------------------
# Define output file options
#
histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0,
                             "/home/rodrigo/Projetos/reclink/teste/histogram")

for line in histo_str_list:
  print line

output.SaveMatchStatusFile(class_w_vec_dict, m_set,
                           "/home/rodrigo/Projetos/reclink/teste/match-status")

output.SaveMatchDataSet(m_set, data_set_a, "match_id",
                        "/home/rodrigo/Projetos/reclink/teste/sdf1103-match.csv",
                        data_set_b, "match_id",
                        "/home/rodrigo/Projetos/reclink/teste/sdf1703-match.csv")

# =============================================================================
# End of Febrl project module: "SeguroDefeso.py"
# =============================================================================
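# -----------------------------------------------------------------------------
# The two CSV files written by SaveMatchDataSet above can be joined on their
# "match_id" column to see which records were linked across the two data sets.
# Below is a minimal, illustrative sketch of such an inspection, assuming both
# files carry a header line with a "match_id" column; the exact layout depends
# on Febrl's SaveMatchDataSet output, so adjust the column handling as needed.

import csv
from collections import defaultdict

def load_by_match_id(path):
  """Group the rows of a match CSV by their match_id value."""
  groups = defaultdict(list)
  with open(path, 'r') as f:
    for row in csv.DictReader(f):
      if row.get('match_id'):  # skip records that were not matched
        groups[row['match_id']].append(row)
  return groups

side_a = load_by_match_id("/home/rodrigo/Projetos/reclink/teste/sdf1103-match.csv")
side_b = load_by_match_id("/home/rodrigo/Projetos/reclink/teste/sdf1703-match.csv")

# Print each matched group side by side
for match_id in sorted(set(side_a) & set(side_b)):
  print((match_id, side_a[match_id], side_b[match_id]))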
def index_and_classify(ds_size, corruption, index_def, index_method,
                       field_list, param_index):

  # print '%s %s %d:\n' % (index_method_dic[index_method], field_list, param_index)
  # init_logger(ds_size, index_method, field_list, param_index)

  # Build and compact index
  #
  blocking_start_time = time.time()

  index_def.build()
  index_def.compact()

  # Do record pair comparisons
  #
  [field_names_list, w_vec_dict] = index_def.run()

  blocking_end_time = time.time()
  blocking_time = blocking_end_time - blocking_start_time

  # ---------------------------------------------------------------------------
  # Define weight vector (record pair) classifier
  #
  comparison_start_time = time.time()

  classifier = classification.FellegiSunter(lower_threshold=0.99,
                                            upper_threshold=0.99)

  # Unsupervised training of classifier
  #
  class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

  classifier.train(class_w_vec_dict, set(), set())

  # Classify all weight vectors
  #
  [m_set, nm_set, pm_set] = classifier.classify(class_w_vec_dict)

  comparison_end_time = time.time()
  comparison_time = comparison_end_time - comparison_start_time

  # ---------------------------------------------------------------------------
  # Define output file options
  #
  histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)

  # for line in histo_str_list:
  #   print line

  output.SaveMatchStatusFile(class_w_vec_dict, m_set,
      "D:/Data/LinkageWithMissingValues/FL/FL16/sampled/results/dup_%d_%d_%s_%s_%d.txt" %
      (ds_size_list[0], corruption, index_method_dic[index_method],
       field_list, param_index))

  print(histo_str_list)

  # Parse the match / non-match counts out of the histogram lines
  if len(histo_str_list) == 7:
    non_match_count = int(histo_str_list[4].split('|')[0].strip())
    match_count = int(histo_str_list[5].split('|')[0].strip())
  else:
    non_match_count = 0
    match_count = int(histo_str_list[4].split('|')[0].strip())

  # float() guards against integer division under Python 2
  recall = match_count / float(ds_size * duplicate_percentage)
  total_comparisons = non_match_count + match_count
  reduction_ratio = (total_comparisons * 100) / ((ds_size * (ds_size - 1)) / 2.0)

  print '%d, %.2f, %.2f, %d, %.2f, %.2f' % (match_count, recall,
      reduction_ratio, total_comparisons, blocking_time, comparison_time)

  all_result_file.write('%d, %.2f, %.2f, %d, %.2f, %.2f\n' %
      (match_count, recall, reduction_ratio, total_comparisons,
       blocking_time, comparison_time))
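# -----------------------------------------------------------------------------
# A worked example of the two metrics computed above, using illustrative
# numbers (the real values come from the histogram): recall is the fraction of
# true duplicates found, and the "reduction_ratio" printed here is actually the
# percentage of all ds_size * (ds_size - 1) / 2 candidate pairs that were
# compared, so smaller is better for blocking. All values below are
# hypothetical, chosen only to make the arithmetic concrete.

def _metrics_example():
  """Illustrative only: hypothetical numbers, not real results."""
  ds_size = 10000              # hypothetical data set size
  duplicate_percentage = 0.1   # hypothetical: 1,000 true duplicate records
  match_count = 900            # hypothetical matched pairs found
  non_match_count = 119100     # hypothetical compared pairs that did not match

  recall = match_count / float(ds_size * duplicate_percentage)  # 0.90
  total_comparisons = non_match_count + match_count             # 120000
  reduction_ratio = (total_comparisons * 100) / \
                    ((ds_size * (ds_size - 1)) / 2.0)           # ~0.24 (%)
  print('%.2f, %.2f, %d' % (recall, reduction_ratio, total_comparisons))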
# Define weight vector (record pair) classifier
#
classifier = classification.FellegiSunter(lower_threshold=0.3,
                                          upper_threshold=0.8)

# Unsupervised training of classifier
#
class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

classifier.train(class_w_vec_dict, set(), set())

# Classify all weight vectors
#
[m_set, nm_set, pm_set] = classifier.classify(class_w_vec_dict)

# -----------------------------------------------------------------------------
# Define output file options
#
histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)

for line in histo_str_list:
  print line

# Raw string so back-slashes in the Windows path are not read as escape
# sequences (the plain "\r" in the original path would become a carriage return)
output.SaveMatchStatusFile(class_w_vec_dict, m_set,
                           r"C:\Users\Abdullah\Dropbox\rla\TestData\DataOut5k.txt")

# =============================================================================
# End of Febrl project module: "C05.py"
# =============================================================================
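# -----------------------------------------------------------------------------
# With lower_threshold=0.3 and upper_threshold=0.8 (unlike the 0.99/0.99
# setting used elsewhere) there is a real band of "possible matches" between
# the two thresholds. The conceptual sketch below shows the three-way split
# that classify() produces; it sums each weight vector into a single matching
# weight, which is an assumption for illustration, not Febrl's internal code.

def threshold_classify(w_vec_dict, lower, upper):
  """Split record pair ids into match / non-match / possible-match sets."""
  m_set, nm_set, pm_set = set(), set(), set()
  for rec_pair_id, w_vec in w_vec_dict.items():
    total_weight = sum(w_vec)
    if total_weight >= upper:
      m_set.add(rec_pair_id)    # confident match
    elif total_weight <= lower:
      nm_set.add(rec_pair_id)   # confident non-match
    else:
      pm_set.add(rec_pair_id)   # possible match: needs clerical review
  return [m_set, nm_set, pm_set]

# Example: pairs scoring 0.9, 0.5 and 0.1 land in m_set, pm_set and nm_set
# print(threshold_classify({('a1', 'b1'): [0.9], ('a2', 'b2'): [0.5],
#                           ('a3', 'b3'): [0.1]}, 0.3, 0.8))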
def run():
  for ds in ['fl', 'nc']:
    if ds == 'fl':
      ds_dir = fl_ds_dir
      ds_keys_list = fl_keys_list
      ds_field_list = fl_field_list
    elif ds == 'nc':
      ds_dir = nc_ds_dir
      ds_keys_list = nc_keys_list
      ds_field_list = nc_field_list

    for corruption_percentage in corruption_percentage_list:
      for missing_percentage in missing_percentage_list:
        if corruption_percentage == 5 and missing_percentage == 20:
          continue

        ds_path = os.path.join(ds_dir,
            '{}_missing_{}_corruption_{}.txt'.format(ds_size,
                missing_percentage, corruption_percentage))

        # Define input data set A:
        #
        data_set_a = dataset.DataSetCSV(description="Data set",
                                        access_mode="read",
                                        strip_fields=True,
                                        miss_val=[''],
                                        rec_ident='r',
                                        file_name=ds_path,
                                        header_line=False,
                                        delimiter=",",
                                        field_list=ds_field_list)

        # ---------------------------------------------------------------------
        # Define field comparison functions
        #
        fc_funct_1 = comparison.FieldComparatorExactString(
            agree_weight=1.0,
            description="Str-Exact-field-17-field-17",
            disagree_weight=0.0,
            missing_weight=0.0)

        field_comp_list = [(fc_funct_1, "identifier", "identifier")]

        rec_comp = comparison.RecordComparator(data_set_a, data_set_a,
                                               field_comp_list)

        # ---------------------------------------------------------------------
        for keys_idx, keys in enumerate(ds_keys_list):
          for method_idx, method in enumerate(methods):
            # if method_idx == 4:
            #   continue
            for is_tight in [True, False]:
              result_file_path = myutil.get_result_path(ds_path, method,
                                                        keys_idx, is_tight)
              result_file = open(result_file_path, 'a')

              for params_idx, params in enumerate(
                  get_params_list(method_idx, keys_idx, is_tight)):
                if (is_params_not_allowed(keys_idx, method_idx, is_tight,
                                          params_idx) or
                    myutil.is_result_already_stored(result_file_path,
                                                    params_idx)):
                  continue

                print(params)

                index_def = get_index_def(method_idx, keys, params,
                                          data_set_a, rec_comp)

                # Re-initialise logging for this run
                for handler in logging.root.handlers[:]:
                  logging.root.removeHandler(handler)
                log_file_path = myutil.get_log_path(ds_path, method, keys_idx,
                                                    is_tight, params_idx)
                logging.basicConfig(filename=log_file_path, filemode='w',
                                    level=logging.INFO)
                logging.getLogger()

                # -------------------------------------------------------------
                blocking_start_time = time.time()

                # Build and compact index
                index_def.build()
                index_def.compact()

                # Do record pair comparisons
                [field_names_list, w_vec_dict] = index_def.run()

                blocking_end_time = time.time()
                blocking_time = blocking_end_time - blocking_start_time

                # -------------------------------------------------------------
                comparison_start_time = time.time()

                # Define weight vector (record pair) classifier
                classifier = classification.FellegiSunter(
                    lower_threshold=0.99, upper_threshold=0.99)

                # Unsupervised training of classifier
                class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

                classifier.train(class_w_vec_dict, set(), set())

                # Classify all weight vectors
                [m_set, nm_set, pm_set] = classifier.classify(class_w_vec_dict)

                comparison_end_time = time.time()
                comparison_time = comparison_end_time - comparison_start_time

                # -------------------------------------------------------------
                # Define output file options
                #
                histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)
                print(histo_str_list)

                match_count, recall, reduction_ratio, total_comparisons = \
                    myutil.get_metrics(ds_size, duplicate_percentage,
                                       histo_str_list)

                # for line in histo_str_list:
                #   print line

                match_file_path = myutil.get_matches_path(ds_path, method,
                                                          keys_idx, is_tight,
                                                          params_idx)
                output.SaveMatchStatusFile(class_w_vec_dict, m_set,
                                           match_file_path)

                print('{} {} {}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}\n'.format(
                    method, keys_idx, params_idx, match_count, recall,
                    reduction_ratio, total_comparisons, blocking_time,
                    comparison_time))

                result_file.write('{}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}\n'
                    .format(params_idx, match_count, recall, reduction_ratio,
                            total_comparisons, blocking_time, comparison_time))
                result_file.flush()
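# -----------------------------------------------------------------------------
# myutil.get_metrics is not shown in this file. Based on the inline histogram
# parsing in index_and_classify above, a plausible sketch of it is given below;
# the histogram line format ("count | bar") and the list indices are
# assumptions carried over from that function, not a confirmed myutil API.

def get_metrics(ds_size, duplicate_percentage, histo_str_list):
  """Derive match count, recall and comparison statistics from the
  histogram lines produced by output.GenerateHistogram."""
  if len(histo_str_list) == 7:  # both a non-match and a match row are present
    non_match_count = int(histo_str_list[4].split('|')[0].strip())
    match_count = int(histo_str_list[5].split('|')[0].strip())
  else:                         # only a match row is present
    non_match_count = 0
    match_count = int(histo_str_list[4].split('|')[0].strip())

  recall = match_count / float(ds_size * duplicate_percentage)
  total_comparisons = non_match_count + match_count
  reduction_ratio = (total_comparisons * 100) / \
                    ((ds_size * (ds_size - 1)) / 2.0)
  return match_count, recall, reduction_ratio, total_comparisons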