def remove_false_positives(qstring, candidate_strings, candidate_string_attrs=None):
    """Keep only the candidates that pass the distance check against qstring.

    Args:
        qstring: The query string.
        candidate_strings: Sized iterable of candidate strings to verify.
        candidate_string_attrs: Optional mapping from each candidate string
            to an ``(elements, length)`` pair. When it is truthy and there
            are more than CAND_STRINGS_THRESHOLD candidates, it is used to
            discard candidates with ed_property_is_satisfied() before the
            distance check runs.

    Returns:
        A tuple ``(approximate_matches, elapsed)``: the list of candidates
        for which strings_are_within_distance_K() was truthy with
        ``K=ED_THRESHOLD + 1``, and the difference between two time()
        readings scaled by 1e6 (microseconds, assuming time() returns
        seconds -- confirm against the file's import).
    """
    started_at = int(round(time() * 1000000))
    query_len = len(qstring)

    # Short-circuit keeps len() from being evaluated when no attrs are given.
    use_prefilter = (candidate_string_attrs
                     and len(candidate_strings) > CAND_STRINGS_THRESHOLD)
    if use_prefilter:
        query_elements = get_string_elements(qstring)
        survivors = []
        for candidate in candidate_strings:
            cand_elements, cand_len = candidate_string_attrs[candidate]
            same_length = query_len == cand_len
            if ed_property_is_satisfied(query_elements, cand_elements, same_length):
                survivors.append(candidate)
        candidate_strings = survivors

    # Verification pass over whatever candidates remain.
    approximate_matches = [
        candidate
        for candidate in candidate_strings
        if strings_are_within_distance_K(qstring, candidate, query_len,
                                         len(candidate), K=ED_THRESHOLD + 1)
    ]

    finished_at = int(round(time() * 1000000))
    return approximate_matches, finished_at - started_at
def _create_dense_index(strings):
    """Build the dense index for *strings* and register it.

    The index is a dict keyed by each string's position in *strings*; every
    value is the tuple ``(string, get_string_elements(string), len(string))``.
    The finished dict is passed to set_dense_index(); nothing is returned.

    Args:
        strings: Iterable of strings to index.
    """
    # dict() over a generator avoids relying on dict-comprehension syntax.
    dense_index = dict(
        (position, (string, get_string_elements(string), len(string)))
        for position, string in enumerate(strings)
    )
    set_dense_index(dense_index)
    if VERBOSITY:
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('Created dense index')