# Canopy indexing (threshold based) - - - - - - - - - - - - - - - - - - - - - # for q in [2,3]: for canopy_method in [('tfidf', 'threshold', 0.9, 0.8), ('jaccard', 'threshold', 0.9, 0.8), ('tfidf', 'threshold', 0.8, 0.7), ('jaccard', 'threshold', 0.8, 0.7)]: for this_index_def in index_def_list: canopy_index = indexing.CanopyIndex(desc='Canopy TH index: q=%d, %s' % \ (q, str(canopy_method)), dataset1 = data_set1, dataset2 = data_set2, rec_comparator = rec_cmp, progress=progress_precentage, index_def = this_index_def, padd = True, q = q, canopy_m = canopy_method) ds_index_list.append(['canopy-th', canopy_index]) # Canopy indexing (nearest neighbour based) - - - - - - - - - - - - - - - - - # for q in [2,3]: for canopy_method in [('tfidf', 'nearest', 5, 10), ('jaccard', 'nearest', 5, 10), ('tfidf', 'nearest', 10, 20), ('jaccard', 'nearest', 10, 20)]:
elif field_list == 'list3': param_set = [('jaccard', 'threshold', 0.353, 0.25, 2)] for i in range(len(param_set)): print 'param: {}\n'.format(param_set[i]) all_result_file.write('param: {}\n'.format( param_set[i])) index_def = indexing.CanopyIndex( dataset1=data_set_a, dataset2=data_set_a, progress_report=10, rec_comparator=rec_comp, index_sep_str="", skip_missing=True, index_def=index_def_list, canopy_method=(param_set[i][0], param_set[i][1], param_set[i][2], param_set[i][3]), q=param_set[i][4], delete_perc=100, padded=True) index_and_classify(ds_size, corruption, index_def, index_method, field_list, i) all_result_file.flush() elif index_method == 5: print 'stringmap index with {}:\n'.format(field_list)
# Compare "field-1" of the two data sets using the configured field
# comparison function.
field_comp_list = [(fc_funct_1, "field-1", "field-1")]

rec_comp = comparison.RecordComparator(data_set_a, data_set_b,
                                       field_comp_list)

# -----------------------------------------------------------------------------
# Define indices for "blocking"
#
# Index key: a substring encoding (characters 0..6, maximum key length 7)
# of "field-1" from both data sets.
#
index_def_1 = [["field-1", "field-1", False, False, 7,
                [encode.get_substring, 0, 6]]]

# Canopy clustering index using Jaccard similarity with tight/loose
# thresholds 0.8 / 0.3 over padded 3-grams.
#
index = indexing.CanopyIndex(dataset1=data_set_a,
                             dataset2=data_set_b,
                             rec_comparator=rec_comp,
                             progress_report=10,
                             index_sep_str="",
                             skip_missing=True,
                             index_def=[index_def_1],
                             canopy_method=("jaccard", "threshold", .8, .3),
                             q=3,
                             delete_perc=100,
                             padded=True)

# Build and compact the index.
#
index.build()
index.compact()

# Do the record pair comparisons; returns the compared field names and the
# dictionary of weight vectors keyed by record pair.
#
[field_names_list, w_vec_dict] = index.run()
def get_index_def(method_idx, keys, params, data_set_a, rec_comp):
    """Build and return a Febrl index object for deduplicating 'data_set_a'.

    Parameters:
      method_idx  Integer in [0, 5] selecting the indexing technique:
                    0 - standard blocking      (DedupIndex, 'block')
                    1 - sorted neighbourhood   (DedupIndex, 'sort')
                    2 - q-gram blocking        (DedupIndex, 'qgram')
                    3 - canopy clustering      (CanopyIndex)
                    4 - string-map             (StringMapIndex)
                    5 - suffix array           (SuffixArrayIndex)
      keys        Index (blocking key) definition list, passed through as
                  the 'index_def' argument of the chosen index class.
      params      Method-specific parameter sequence; its required layout
                  depends on 'method_idx' (e.g. window size for method 1,
                  canopy settings for method 3) -- see the branch comments.
      data_set_a  The data set to be indexed; used as both 'dataset1' and
                  'dataset2' because this is a deduplication setup.
      rec_comp    Record comparator used for the detailed comparisons.

    Returns the constructed (not yet built) index object.

    Raises ValueError if 'method_idx' is outside [0, 5] (previously this
    case crashed with an UnboundLocalError on the final 'return').
    """

    # Keyword arguments identical across every index variant; factored out
    # so each branch only states what actually differs.
    common_kwargs = dict(dataset1=data_set_a,
                         dataset2=data_set_a,
                         progress_report=10,
                         rec_comparator=rec_comp,
                         index_sep_str="",
                         skip_missing=True,
                         index_def=keys)

    if method_idx == 0:    # Standard blocking.
        index_def = indexing.DedupIndex(block_method=("block",),
                                        **common_kwargs)

    elif method_idx == 1:  # Sorted neighbourhood; params[0] is passed to
                           # the 'sort' block method (window size).
        index_def = indexing.DedupIndex(block_method=("sort", params[0]),
                                        **common_kwargs)

    elif method_idx == 2:  # Q-gram blocking; params[0] and params[1] are the
                           # q-gram settings handed to the 'qgram' method.
        index_def = indexing.DedupIndex(block_method=("qgram", params[0],
                                                      True, params[1]),
                                        **common_kwargs)

    elif method_idx == 3:  # Canopy clustering; params[0:4] define the canopy
                           # method tuple, params[4] is the q-gram length.
        index_def = indexing.CanopyIndex(canopy_method=(params[0], params[1],
                                                        params[2], params[3]),
                                         q=params[4],
                                         delete_perc=100,
                                         padded=True,
                                         **common_kwargs)

    elif method_idx == 4:  # String-map index; params[0:3] are grid/dimension
                           # settings, params[3:6] form the canopy method.
        index_def = indexing.StringMapIndex(canopy_method=(params[3],
                                                           params[4],
                                                           params[5]),
                                            grid_resolution=params[0],
                                            dim=params[1],
                                            sub_dim=params[2],
                                            cache_dist=True,
                                            sim_funct=stringcmp.editdist,
                                            **common_kwargs)

    elif method_idx == 5:  # Suffix-array index; params[0] is the suffix
                           # method, params[1:3] the block method tuple.
        index_def = indexing.SuffixArrayIndex(suffix_method=params[0],
                                              block_method=(params[1],
                                                            params[2]),
                                              padded=True,
                                              **common_kwargs)

    else:
        raise ValueError('method_idx must be in [0, 5], got %r'
                         % (method_idx,))

    return index_def