Example #1
0
#
class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

# Train on the weight vectors alone; both example sets are passed in empty.
classifier.train(class_w_vec_dict, set(), set())

# Classify all weight vectors
#
m_set, nm_set, pm_set = classifier.classify(class_w_vec_dict)

# -----------------------------------------------------------------------------

# Define output file options
#
# Every result file lives in the same directory; naming it once means the
# location only has to be edited in one place.
match_output_dir = "/home/rodrigo/Projetos/reclink/teste"

histo_str_list = output.GenerateHistogram(
    class_w_vec_dict, 1.0, match_output_dir + "/histogram")

# print(...) with a single argument prints the same text on Python 2 and,
# unlike the bare print statement, is also valid Python 3.
for line in histo_str_list:
    print(line)

output.SaveMatchStatusFile(
    class_w_vec_dict, m_set, match_output_dir + "/match-status")

output.SaveMatchDataSet(
    m_set, data_set_a, "match_id",
    match_output_dir + "/sdf1103-match.csv", data_set_b,
    "match_id", match_output_dir + "/sdf1703-match.csv")

# =============================================================================
# End of Febrl project module: "SeguroDefeso.py"
# =============================================================================
Example #2
0
def index_and_classify(ds_size, corruption, index_def, index_method,
                       field_list, param_index):
    """Run one blocking + classification experiment and report its metrics.

    The index is built, compacted and run to obtain the weight vectors, which
    are then classified with a Fellegi-Sunter classifier whose lower and upper
    thresholds are both 0.99.  The match status file is saved, the histogram
    is printed, and one line of metrics is printed and appended to
    ``all_result_file``.

    Besides its arguments the function reads these module-level names:
    ``classification``, ``output``, ``ds_size_list``, ``index_method_dic``,
    ``duplicate_percentage`` and ``all_result_file``.

    Args:
        ds_size: Number of records; used for recall and for the total number
            of record pairs.
        corruption: Value embedded in the match status file name.
        index_def: Index object providing ``build()``, ``compact()`` and
            ``run()``; ``run()`` must return ``[field_names, w_vec_dict]``.
        index_method: Key into ``index_method_dic``; the looked-up value is
            embedded in the match status file name.
        field_list: Value embedded in the match status file name.
        param_index: Value embedded in the match status file name.

    Returns:
        None.  Results are reported through stdout, ``all_result_file`` and
        the saved match status file.
    """
    # Build and compact index, then do the record pair comparisons.  All three
    # steps are timed together as the "blocking" phase.
    blocking_start_time = time.time()
    index_def.build()
    index_def.compact()
    _field_names_list, w_vec_dict = index_def.run()
    blocking_time = time.time() - blocking_start_time

    # -------------------------------------------------------------------------

    # Define, train (unsupervised: both example sets are empty) and apply the
    # weight vector (record pair) classifier; timed as the "comparison" phase.
    comparison_start_time = time.time()
    classifier = classification.FellegiSunter(lower_threshold=0.99,
                                              upper_threshold=0.99)
    class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary
    classifier.train(class_w_vec_dict, set(), set())
    m_set, _nm_set, _pm_set = classifier.classify(class_w_vec_dict)
    comparison_time = time.time() - comparison_start_time

    # -------------------------------------------------------------------------

    # Define output file options
    #
    histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)

    # NOTE(review): the file name uses ds_size_list[0], not the ds_size
    # argument -- confirm that this is intended.
    output.SaveMatchStatusFile(
        class_w_vec_dict, m_set,
        "D:/Data/LinkageWithMissingValues/FL/FL16/sampled/results/dup_%d_%d_%s_%s_%d.txt"
        % (ds_size_list[0], corruption, index_method_dic[index_method],
           field_list, param_index))

    print(histo_str_list)

    # The counts are the first '|'-separated column of the histogram rows.
    # A 7-line histogram carries a non-match row (index 4) followed by a match
    # row (index 5); any other length is treated as having only a match row
    # at index 4.
    # NOTE(review): this depends on the exact layout GenerateHistogram emits.
    if len(histo_str_list) == 7:
        non_match_count = int(histo_str_list[4].split('|')[0].strip())
        match_count = int(histo_str_list[5].split('|')[0].strip())
    else:
        non_match_count = 0
        match_count = int(histo_str_list[4].split('|')[0].strip())

    # float() keeps the ratio exact on Python 2 even if the denominator
    # happens to be an integer; with a float denominator nothing changes.
    recall = float(match_count) / (ds_size * duplicate_percentage)
    total_comparisons = non_match_count + match_count
    # Comparisons performed, as a percentage of all ds_size*(ds_size-1)/2
    # possible record pairs.
    reduction_ratio = (total_comparisons * 100) / ((ds_size *
                                                    (ds_size - 1)) / 2.0)

    # Format once so the console line and the result file line cannot drift.
    result_line = '%d, %.2f, %.2f, %d, %.2f, %.2f' % (
        match_count, recall, reduction_ratio, total_comparisons, blocking_time,
        comparison_time)
    print(result_line)
    all_result_file.write(result_line + '\n')
Example #3
0
# Define weight vector (record pair) classifier
#
classifier = classification.FellegiSunter(lower_threshold=0.3,
                                          upper_threshold=0.8)

# Unsupervised training of classifier
#
class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary

classifier.train(class_w_vec_dict, set(), set())

# Classify all weight vectors
#
m_set, nm_set, pm_set = classifier.classify(class_w_vec_dict)

# -----------------------------------------------------------------------------

# Define output file options
#
histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)

# print(...) with a single argument prints the same text on Python 2 and,
# unlike the bare print statement, is also valid Python 3.
for line in histo_str_list:
    print(line)

# The path must be a raw string: in a normal literal the "\r" in
# "Dropbox\rla" is read as a carriage return (and "\U" is a syntax error on
# Python 3), so the status file would not be written to the intended location.
output.SaveMatchStatusFile(
    class_w_vec_dict, m_set,
    r"C:\Users\Abdullah\Dropbox\rla\TestData\DataOut5k.txt")


# =============================================================================
# End of Febrl project module: "C05.py"
# =============================================================================
Example #4
0
def _reset_root_logging(log_file_path):
    """Send root-logger output to ``log_file_path``, replacing old handlers.

    ``logging.basicConfig`` is a no-op while the root logger still has
    handlers, so the existing ones are removed first.  Each removed handler
    is also closed so the log file of the previous run is released.
    """
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        handler.close()
    logging.basicConfig(filename=log_file_path,
                        filemode='w',
                        level=logging.INFO)


def _link_and_classify(index_def):
    """Run blocking, comparison and classification for one index definition.

    Returns:
        A tuple ``(class_w_vec_dict, m_set, blocking_time, comparison_time)``.
        ``blocking_time`` covers building, compacting and running the index;
        ``comparison_time`` covers creating, training and applying the
        classifier.  Both are ``time.time()`` differences.
    """
    blocking_start_time = time.time()
    # Build and compact index
    index_def.build()
    index_def.compact()
    # Do record pair comparisons
    _field_names_list, w_vec_dict = index_def.run()
    blocking_time = time.time() - blocking_start_time

    comparison_start_time = time.time()
    # Define weight vector (record pair) classifier
    classifier = classification.FellegiSunter(lower_threshold=0.99,
                                              upper_threshold=0.99)
    # Unsupervised training of classifier: both example sets are empty.
    class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary
    classifier.train(class_w_vec_dict, set(), set())
    # Classify all weight vectors
    m_set, _nm_set, _pm_set = classifier.classify(class_w_vec_dict)
    comparison_time = time.time() - comparison_start_time

    return class_w_vec_dict, m_set, blocking_time, comparison_time


def run():
    """Run the whole experiment grid and store one result line per run.

    Iterates over both data sets ('fl' and 'nc'), every corruption/missing
    percentage pair (except 5/20, which is skipped), every key definition,
    every method, both ``is_tight`` settings and every parameter set.  For
    each combination that is allowed and not already stored, the records are
    linked and classified, the match status file is saved, and a metrics line
    is printed and appended to the combination's result file.

    Reads module-level configuration (``fl_*``/``nc_*`` settings,
    ``corruption_percentage_list``, ``missing_percentage_list``, ``ds_size``,
    ``duplicate_percentage``, ``methods``) and uses the module-level helpers
    ``get_params_list``, ``is_params_not_allowed``, ``get_index_def`` and
    ``myutil``.
    """
    for ds in ['fl', 'nc']:
        if ds == 'fl':
            ds_dir = fl_ds_dir
            ds_keys_list = fl_keys_list
            ds_field_list = fl_field_list
        elif ds == 'nc':
            ds_dir = nc_ds_dir
            ds_keys_list = nc_keys_list
            ds_field_list = nc_field_list

        for corruption_percentage in corruption_percentage_list:

            for missing_percentage in missing_percentage_list:

                # This particular combination is skipped.
                if corruption_percentage == 5 and missing_percentage == 20:
                    continue

                ds_path = os.path.join(
                    ds_dir, '{}_missing_{}_corruption_{}.txt'.format(
                        ds_size, missing_percentage, corruption_percentage))

                # Define input data set A:
                #
                data_set_a = dataset.DataSetCSV(description="Data set",
                                                access_mode="read",
                                                strip_fields=True,
                                                miss_val=[''],
                                                rec_ident='r',
                                                file_name=ds_path,
                                                header_line=False,
                                                delimiter=",",
                                                field_list=ds_field_list)

                # -------------------------------------------------------------

                # Define field comparison functions
                #
                fc_funct_1 = comparison.FieldComparatorExactString(
                    agree_weight=1.0,
                    description="Str-Exact-field-17-field-17",
                    disagree_weight=0.0,
                    missing_weight=0.0)

                field_comp_list = [(fc_funct_1, "identifier", "identifier")]

                # The data set is compared with itself.
                rec_comp = comparison.RecordComparator(data_set_a, data_set_a,
                                                       field_comp_list)

                # -------------------------------------------------------------

                for keys_idx, keys in enumerate(ds_keys_list):

                    for method_idx, method in enumerate(methods):

                        for is_tight in [True, False]:

                            result_file_path = myutil.get_result_path(
                                ds_path, method, keys_idx, is_tight)

                            # The context manager closes the result file even
                            # when a run raises, instead of leaking one open
                            # handle per combination.
                            with open(result_file_path, 'a') as result_file:

                                for params_idx, params in enumerate(
                                        get_params_list(method_idx, keys_idx,
                                                        is_tight)):

                                    if (is_params_not_allowed(
                                            keys_idx, method_idx, is_tight,
                                            params_idx)
                                            or myutil.is_result_already_stored(
                                                result_file_path, params_idx)):
                                        continue

                                    print(params)

                                    index_def = get_index_def(
                                        method_idx, keys, params, data_set_a,
                                        rec_comp)

                                    # Each parameter set logs to its own file.
                                    log_file_path = myutil.get_log_path(
                                        ds_path, method, keys_idx, is_tight,
                                        params_idx)
                                    _reset_root_logging(log_file_path)

                                    # ---------------------------------------

                                    (class_w_vec_dict, m_set, blocking_time,
                                     comparison_time
                                     ) = _link_and_classify(index_def)

                                    # ---------------------------------------

                                    # Define output file options
                                    #
                                    histo_str_list = output.GenerateHistogram(
                                        class_w_vec_dict, 1.0)
                                    print(histo_str_list)

                                    (match_count, recall, reduction_ratio,
                                     total_comparisons) = myutil.get_metrics(
                                         ds_size, duplicate_percentage,
                                         histo_str_list)

                                    match_file_path = myutil.get_matches_path(
                                        ds_path, method, keys_idx, is_tight,
                                        params_idx)
                                    output.SaveMatchStatusFile(
                                        class_w_vec_dict, m_set,
                                        match_file_path)

                                    # Format the metrics once so the console
                                    # and the result file always agree.
                                    metrics = (
                                        '{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}'
                                        .format(match_count, recall,
                                                reduction_ratio,
                                                total_comparisons,
                                                blocking_time,
                                                comparison_time))
                                    print('{} {} {}|{}\n'.format(
                                        method, keys_idx, params_idx, metrics))
                                    result_file.write('{}|{}\n'.format(
                                        params_idx, metrics))
                                    # Flush after every run so the line is on
                                    # disk right away; presumably
                                    # is_result_already_stored re-reads this
                                    # path -- verify against myutil.
                                    result_file.flush()