Ejemplo n.º 1
0
# Field comparators for the census data: one exact string comparator for the
# entity identifier and one Winkler comparator (threshold 0) for each of the
# surname, given name and suburb fields.
#
census_entity_id_exact = comparison.FieldComparatorExactString(
    desc='entity_id_exact')
census_surname_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='surname_winkler')
census_given_name_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='given_name_winkler')
census_suburb_winkler = comparison.FieldComparatorWinkler(
    thres=0, desc='suburb_winkler')

# Each entry pairs a comparator with two field names (presumably the field in
# data set A followed by the field in data set B -- confirm against the
# RecordComparator documentation).
#
census_fc_list = [
    (census_entity_id_exact, 'entity_id', 'entity_id'),
    (census_surname_winkler, 'surname', 'surname'),
    (census_given_name_winkler, 'given_name', 'given_name'),
    (census_suburb_winkler, 'suburb', 'suburb'),
]

# Record comparator built from the two census data sets and the list above.
#
census_rec_comp = comparison.RecordComparator(
    census_ds_A, census_ds_B, census_fc_list, 'Census record comparator')

# Function to be used to check for true matches and non-matches
#
def census_check_funct(rec1, rec2):
  """Return True when both records carry an equal value at index 1.

  rec1, rec2 -- indexable raw records; only element 1 of each is inspected.
  """
  left_value = rec1[1]
  right_value = rec2[1]
  return left_value == right_value

# Function to be used to extract the record identifier from a raw record
#
def census_get_id_funct(rec):
  """Return the element at index 1 of the raw record, used as its identifier.

  rec -- indexable raw record with at least two elements.
  """
  record_id = rec[1]
  return record_id

# Insert into data set dictionary
#
experiment_dict['census'] = ['Census', census_ds_A, census_ds_B,
Ejemplo n.º 2
0
# Winkler comparator applied to the "Description" field (see the list below).
#
fc_funct_4 = comparison.FieldComparatorWinkler(
    agree_weight=1.0,
    description="Winkler-Description-Description",
    disagree_weight=0.0,
    missing_weight=0.0,
    threshold=0.0,
    check_sim=True,
    check_init=True,
    check_long=True,
)

# One entry per compared field: the comparator followed by two field names
# (the same name is given twice for every field).
#
field_comp_list = [
    (fc_funct_1, "Subject", "Subject"),
    (fc_funct_2, "Creator", "Creator"),
    (fc_funct_3, "Title", "Title"),
    (fc_funct_4, "Description", "Description"),
]

# data_set_a is passed as both data sets, i.e. it is compared with itself.
#
rec_comp = comparison.RecordComparator(
    data_set_a, data_set_a, field_comp_list)

# -----------------------------------------------------------------------------

# Define the index used for "blocking": a FullIndex over data_set_a on both
# sides, with an empty index definition and the record comparator above.
#
index = indexing.FullIndex(
    dataset1=data_set_a,
    dataset2=data_set_a,
    weight_vec_file="/home/jclark/projects/dpla_appfest/match_weights",
    progress_report=1,
    rec_comparator=rec_comp,
    index_sep_str="",
    skip_missing=True,
    index_def=[],
)

# Build and compact index
Ejemplo n.º 3
0
# Winkler comparators (threshold 0) for the surname, given name and suburb
# fields of the census data.
census_surname_winkler = comparison.FieldComparatorWinkler(
    thres=0,
    desc="surname_winkler",
)
census_given_name_winkler = comparison.FieldComparatorWinkler(
    thres=0,
    desc="given_name_winkler",
)
census_suburb_winkler = comparison.FieldComparatorWinkler(
    thres=0,
    desc="suburb_winkler",
)

# Comparator / field-name / field-name triples handed to the record
# comparator; every field is compared against the field of the same name.
census_fc_list = [(census_entity_id_exact, "entity_id", "entity_id"),
                  (census_surname_winkler, "surname", "surname"),
                  (census_given_name_winkler, "given_name", "given_name"),
                  (census_suburb_winkler, "suburb", "suburb")]

# Record comparator over the two census data sets.
census_rec_comp = comparison.RecordComparator(
    census_ds_A,
    census_ds_B,
    census_fc_list,
    "Census record comparator",
)


# Function to be used to check for true matches and non-matches
#
def census_check_funct(rec1, rec2):
    """Tell whether two raw records agree on the value stored at index 1.

    Args:
        rec1: First indexable raw record.
        rec2: Second indexable raw record.

    Returns:
        The result of comparing ``rec1[1]`` and ``rec2[1]`` for equality.
    """
    first_key, second_key = rec1[1], rec2[1]
    return first_key == second_key


# Function to be used to extract the record identifier from a raw record
#
def census_get_id_funct(rec):
    """Extract the record identifier, i.e. the element at index 1.

    Args:
        rec: Indexable raw record with at least two elements.

    Returns:
        ``rec[1]`` unchanged.
    """
    identifier = rec[1]
    return identifier

Ejemplo n.º 4
0
def run():
    """Run the blocking experiments over all configured combinations.

    Iterates over the 'fl' and 'nc' data sets, every corruption/missing
    percentage pair, every key set, every method, both values of
    ``is_tight`` and every parameter set. For each combination that is
    allowed and not already stored, it builds and runs the index, classifies
    the resulting weight vectors, saves the match status file and appends
    one line of metrics to the result file.

    Relies on names defined outside this function: ``fl_ds_dir``,
    ``fl_keys_list``, ``fl_field_list``, ``nc_ds_dir``, ``nc_keys_list``,
    ``nc_field_list``, ``corruption_percentage_list``,
    ``missing_percentage_list``, ``ds_size``, ``duplicate_percentage``,
    ``methods``, ``get_params_list``, ``is_params_not_allowed``,
    ``get_index_def`` and the ``myutil``, ``dataset``, ``comparison``,
    ``classification`` and ``output`` modules.
    """

    for ds in ['fl', 'nc']:
        # Select the directory, key sets and field list of the current data set.
        if (ds == 'fl'):
            ds_dir = fl_ds_dir
            ds_keys_list = fl_keys_list
            ds_field_list = fl_field_list
        elif (ds == 'nc'):
            ds_dir = nc_ds_dir
            ds_keys_list = nc_keys_list
            ds_field_list = nc_field_list

        for corruption_percentage in corruption_percentage_list:

            for missing_percentage in missing_percentage_list:

                # This one combination is skipped.
                # NOTE(review): the reason is not visible here -- presumably
                # no input file exists for it; confirm.
                if (corruption_percentage == 5 and missing_percentage == 20):
                    continue

                # Input file name encodes size, missing and corruption levels.
                ds_path = os.path.join(
                    ds_dir, '{}_missing_{}_corruption_{}.txt'.format(
                        ds_size, missing_percentage, corruption_percentage))

                # Define input data set A:
                #
                data_set_a = dataset.DataSetCSV(description="Data set",
                                                access_mode="read",
                                                strip_fields=True,
                                                miss_val=[''],
                                                rec_ident='r',
                                                file_name=ds_path,
                                                header_line=False,
                                                delimiter=",",
                                                field_list=ds_field_list)

                # -----------------------------------------------------------------------------

                # Define field comparison functions
                #
                fc_funct_1 = comparison.FieldComparatorExactString(
                    agree_weight=1.0,
                    description="Str-Exact-field-17-field-17",
                    disagree_weight=0.0,
                    missing_weight=0.0)

                # Only the "identifier" field is compared.
                field_comp_list = [(fc_funct_1, "identifier", "identifier")]

                # The same data set is passed on both sides, i.e. it is
                # compared with itself.
                rec_comp = comparison.RecordComparator(data_set_a, data_set_a,
                                                       field_comp_list)

                # -----------------------------------------------------------------------------

                for keys_idx, keys in enumerate(ds_keys_list):

                    for method_idx, method in enumerate(methods):

                        #                         if(method_idx == 4):
                        #                             continue

                        for is_tight in [True, False]:

                            # Results are appended, so lines written by
                            # earlier runs are kept.
                            # NOTE(review): result_file is not closed in the
                            # code visible here -- confirm it is closed
                            # elsewhere or switch to a ``with`` block.
                            result_file_path = myutil.get_result_path(
                                ds_path, method, keys_idx, is_tight)
                            result_file = open(result_file_path, 'a')

                            for params_idx, params in enumerate(
                                    get_params_list(method_idx, keys_idx,
                                                    is_tight)):

                                # Skip parameter sets that are not allowed or
                                # whose result is already in the result file.
                                if (is_params_not_allowed(
                                        keys_idx, method_idx, is_tight,
                                        params_idx)
                                        or myutil.is_result_already_stored(
                                            result_file_path, params_idx)):
                                    continue

                                print(params)

                                index_def = get_index_def(
                                    method_idx, keys, params, data_set_a,
                                    rec_comp)

                                # init_logger

                                # Remove all handlers from the root logger
                                # first: logging.basicConfig() does nothing
                                # when the root logger already has handlers,
                                # so this is what redirects logging to a new
                                # file for every run.
                                for handler in logging.root.handlers[:]:
                                    logging.root.removeHandler(handler)

                                log_file_path = myutil.get_log_path(
                                    ds_path, method, keys_idx, is_tight,
                                    params_idx)
                                logging.basicConfig(filename=log_file_path,
                                                    filemode='w',
                                                    level=logging.INFO)
                                # The returned logger is discarded; this call
                                # has no visible effect here.
                                logging.getLogger()

                                # ----------------------------------------------------------------------------

                                # blocking_time covers build(), compact() and
                                # run() of the index.
                                blocking_start_time = time.time()

                                # Build and compact index
                                index_def.build()
                                index_def.compact()
                                # Do record pair comparisons
                                [field_names_list,
                                 w_vec_dict] = index_def.run()

                                blocking_end_time = time.time()
                                blocking_time = blocking_end_time - blocking_start_time

                                # -----------------------------------------------------------------------------

                                # comparison_time covers classifier creation,
                                # training and classification.
                                comparison_start_time = time.time()

                                # Define weight vector (record pair) classifier
                                # (lower and upper threshold are both 0.99)
                                classifier = classification.FellegiSunter(
                                    lower_threshold=0.99, upper_threshold=0.99)
                                # Unsupervised training of classifier
                                class_w_vec_dict = w_vec_dict  # Use original weight vector dictionary
                                classifier.train(class_w_vec_dict, set(),
                                                 set())
                                # Classify all weight vectors
                                [m_set, nm_set, pm_set
                                 ] = classifier.classify(class_w_vec_dict)

                                comparison_end_time = time.time()
                                comparison_time = comparison_end_time - comparison_start_time

                                # -----------------------------------------------------------------------------

                                # Generate the histogram of the weight
                                # vectors; the metrics below are derived
                                # from it.
                                #
                                histo_str_list = output.GenerateHistogram(
                                    class_w_vec_dict, 1.0)
                                print(histo_str_list)

                                match_count, recall, reduction_ratio, total_comparisons = myutil.get_metrics(
                                    ds_size, duplicate_percentage,
                                    histo_str_list)

                                # for line in histo_str_list:
                                #     print line
                                # Save the match status of the classified
                                # weight vectors to a per-run file.
                                match_file_path = myutil.get_matches_path(
                                    ds_path, method, keys_idx, is_tight,
                                    params_idx)
                                output.SaveMatchStatusFile(
                                    class_w_vec_dict, m_set, match_file_path)

                                # Report the metrics on stdout (prefixed with
                                # method and key index) ...
                                print(
                                    '{} {} {}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}\n'
                                    .format(method, keys_idx, params_idx,
                                            match_count, recall,
                                            reduction_ratio, total_comparisons,
                                            blocking_time, comparison_time))
                                # ... and append them to the result file,
                                # keyed by params_idx.
                                result_file.write(
                                    '{}|{}, {:.2f}, {:.2f}, {}, {:.2f}, {:.2f}\n'
                                    .format(params_idx, match_count, recall,
                                            reduction_ratio, total_comparisons,
                                            blocking_time, comparison_time))
                                # Flush after every parameter set.
                                result_file.flush()