Ejemplo n.º 1
0
  # Threshold-based canopy indexing - - - - - - - - - - - - - - - - - - - - - -
  #
  # One CanopyIndex is created and appended to ds_index_list (tagged
  # 'canopy-th') for every combination of q value, canopy method tuple and
  # entry of index_def_list.
  #
  threshold_canopy_methods = [('tfidf',   'threshold', 0.9, 0.8),
                              ('jaccard', 'threshold', 0.9, 0.8),
                              ('tfidf',   'threshold', 0.8, 0.7),
                              ('jaccard', 'threshold', 0.8, 0.7)]

  for q in [2, 3]:
    for canopy_method in threshold_canopy_methods:
      for this_index_def in index_def_list:

        # The description records q and the canopy method tuple used
        #
        canopy_desc = 'Canopy TH index: q=%d, %s' % (q, str(canopy_method))

        canopy_index = indexing.CanopyIndex(desc=canopy_desc,
                                            dataset1=data_set1,
                                            dataset2=data_set2,
                                            rec_comparator=rec_cmp,
                                            progress=progress_precentage,
                                            index_def=this_index_def,
                                            padd=True,
                                            q=q,
                                            canopy_m=canopy_method)

        ds_index_list.append(['canopy-th', canopy_index])

  # Canopy indexing (nearest neighbour based) - - - - - - - - - - - - - - - - -
  #
  for q in [2,3]:
    for canopy_method in [('tfidf',   'nearest',  5, 10),
                          ('jaccard', 'nearest',  5, 10),
                          ('tfidf',   'nearest', 10, 20),
                          ('jaccard', 'nearest', 10, 20)]:
Ejemplo n.º 2
0
                    elif field_list == 'list3':
                        param_set = [('jaccard', 'threshold', 0.353, 0.25, 2)]

                    for i, this_param in enumerate(param_set):

                        # Report the parameter tuple on stdout and in the
                        # results file (same text in both places).
                        param_msg = 'param: {}\n'.format(this_param)
                        print(param_msg)
                        all_result_file.write(param_msg)

                        # Elements 0-3 of the parameter tuple make up the
                        # canopy method, element 4 is the q value.
                        index_def = indexing.CanopyIndex(
                            dataset1=data_set_a,
                            dataset2=data_set_a,
                            progress_report=10,
                            rec_comparator=rec_comp,
                            index_sep_str="",
                            skip_missing=True,
                            index_def=index_def_list,
                            canopy_method=(this_param[0], this_param[1],
                                           this_param[2], this_param[3]),
                            q=this_param[4],
                            delete_perc=100,
                            padded=True)

                        index_and_classify(ds_size, corruption, index_def,
                                           index_method, field_list, i)

                        # Push the output for this parameter set to the file
                        # before the next iteration starts.
                        all_result_file.flush()

                elif index_method == 5:

                    print 'stringmap index with {}:\n'.format(field_list)
Ejemplo n.º 3
0
# A single field comparison: fc_funct_1 applied to "field-1" of both data sets
#
field_comp_list = [(fc_funct_1, "field-1", "field-1")]

rec_comp = comparison.RecordComparator(data_set_a, data_set_b, field_comp_list)

# -----------------------------------------------------------------------------

# Index definition used for "blocking": one entry on "field-1" of both data
# sets, with encode.get_substring and the arguments 0 and 6
#
index_def_1 = [
    ["field-1", "field-1", False, False, 7, [encode.get_substring, 0, 6]],
]

# Canopy index over the two data sets (jaccard / threshold, q = 3)
#
index = indexing.CanopyIndex(
    dataset1=data_set_a,
    dataset2=data_set_b,
    rec_comparator=rec_comp,
    progress_report=10,
    index_sep_str="",
    skip_missing=True,
    index_def=[index_def_1],
    canopy_method=("jaccard", "threshold", 0.8, 0.3),
    q=3,
    delete_perc=100,
    padded=True,
)

# Build the index, then compact it
#
index.build()
index.compact()

# Run the record pair comparisons; run() returns two values, unpacked here
#
field_names_list, w_vec_dict = index.run()
Ejemplo n.º 4
0
def get_index_def(method_idx, keys, params, data_set_a, rec_comp):
    """Build the index object selected by ``method_idx``.

    Every index is constructed with ``data_set_a`` passed as both
    ``dataset1`` and ``dataset2`` and with the same progress, separator and
    missing-value settings; only the index class and its method-specific
    keyword arguments differ.

    Args:
        method_idx: Integer selecting the index to build:

            * 0 -- ``indexing.DedupIndex`` with ``block_method=("block",)``;
              ``params`` is not read.
            * 1 -- ``indexing.DedupIndex`` with
              ``block_method=("sort", params[0])``.
            * 2 -- ``indexing.DedupIndex`` with
              ``block_method=("qgram", params[0], True, params[1])``.
            * 3 -- ``indexing.CanopyIndex``; ``params[0]`` to ``params[3]``
              form ``canopy_method`` and ``params[4]`` is ``q``.
            * 4 -- ``indexing.StringMapIndex``; ``params[0]``, ``params[1]``
              and ``params[2]`` are ``grid_resolution``, ``dim`` and
              ``sub_dim``; ``params[3]`` to ``params[5]`` form
              ``canopy_method``.
            * 5 -- ``indexing.SuffixArrayIndex``; ``params[0]`` is
              ``suffix_method`` and ``(params[1], params[2])`` is
              ``block_method``.
        keys: Value passed through unchanged as the ``index_def`` argument.
        params: Sequence of method-specific values, read by position as
            listed above.
        data_set_a: Data set passed as both ``dataset1`` and ``dataset2``.
        rec_comp: Value passed through as ``rec_comparator``.

    Returns:
        The newly constructed index object.

    Raises:
        ValueError: If ``method_idx`` is not one of 0 to 5.  (Previously this
            case fell through to the ``return`` with no index assigned and
            failed with an unrelated-looking ``UnboundLocalError``.)
    """
    # Keyword arguments common to every index type; kept in one place so the
    # six constructors cannot drift apart.
    shared_args = dict(dataset1=data_set_a,
                       dataset2=data_set_a,
                       progress_report=10,
                       rec_comparator=rec_comp,
                       index_sep_str="",
                       skip_missing=True,
                       index_def=keys)

    if method_idx == 0:
        return indexing.DedupIndex(block_method=("block", ), **shared_args)

    if method_idx == 1:
        return indexing.DedupIndex(block_method=("sort", params[0]),
                                   **shared_args)

    if method_idx == 2:
        return indexing.DedupIndex(block_method=("qgram", params[0], True,
                                                 params[1]),
                                   **shared_args)

    if method_idx == 3:
        return indexing.CanopyIndex(canopy_method=(params[0], params[1],
                                                   params[2], params[3]),
                                    q=params[4],
                                    delete_perc=100,
                                    padded=True,
                                    **shared_args)

    if method_idx == 4:
        return indexing.StringMapIndex(canopy_method=(params[3],
                                                      params[4],
                                                      params[5]),
                                       grid_resolution=params[0],
                                       dim=params[1],
                                       sub_dim=params[2],
                                       cache_dist=True,
                                       sim_funct=stringcmp.editdist,
                                       **shared_args)

    if method_idx == 5:
        return indexing.SuffixArrayIndex(suffix_method=params[0],
                                         block_method=(params[1],
                                                       params[2]),
                                         padded=True,
                                         **shared_args)

    # No branch matched: report the bad selector explicitly.
    raise ValueError("unknown method_idx %r; expected an integer from 0 to 5"
                     % (method_idx,))