def get_candidate_string_ids(qstring):
    """Gather the ids of strings that are candidates for matching ``qstring``.

    The query is split into q-grams of length ``QGRAM_LENGTH``.  For every
    target length from ``len(qstring) - ED_THRESHOLD`` up to and including
    ``len(qstring) + ED_THRESHOLD``, ``solve_T_occurence_problem`` is asked
    for string ids, and all non-empty answers are concatenated in order of
    increasing length.

    Args:
        qstring: the query string.

    Returns:
        A list of candidate string ids; an empty list when the query
        produces no q-grams.

    Raises:
        AssertionError: if the same id was collected more than once.
    """
    query_qgrams = get_qgrams_from_string(qstring, QGRAM_LENGTH)
    if not query_qgrams:
        return []

    query_len = len(qstring)
    shortest = query_len - ED_THRESHOLD
    longest = query_len + ED_THRESHOLD

    collected_ids = []
    for target_len in range(shortest, longest + 1):
        ids_for_len = solve_T_occurence_problem(query_len, target_len, query_qgrams)
        # The solver may hand back an empty/falsy result; only merge real hits.
        if ids_for_len:
            collected_ids.extend(ids_for_len)

    # Sanity check: no id may have been collected twice across the lengths.
    assert len(collected_ids) == len(set(collected_ids))
    return collected_ids
def _create_inverted_index(strings):
    """Build the q-gram inverted index for ``strings`` and register it.

    The index is a two-level plain ``dict``::

        len(string) -> qgram -> set of string ids

    where a string's id is its position in ``strings`` and its q-grams are
    whatever ``get_qgrams_from_string(string, QGRAM_LENGTH)`` yields.  A
    length bucket is created for every string, even one that yields no
    q-grams.  The finished index is passed to ``set_inverted_index``;
    nothing is returned.

    Args:
        strings: iterable of items supporting ``len()`` (presumably ``str``
            -- confirm against the caller); enumeration order defines ids.
    """
    inverted_index = {}
    for string_id, string in enumerate(strings):
        # Plain dicts + setdefault (not defaultdict) so that consumers of the
        # index still get KeyError on unknown lengths/q-grams.
        postings_by_qgram = inverted_index.setdefault(len(string), {})
        for qgram in get_qgrams_from_string(string, QGRAM_LENGTH):
            postings_by_qgram.setdefault(qgram, set()).add(string_id)

    set_inverted_index(inverted_index)

    if VERBOSITY:
        # Parenthesised single-argument form prints the same text under
        # Python 2 and is valid syntax under Python 3.
        print('Created inverted index')