def create_indexes():
    """Build the dense and inverted indexes concurrently.

    The strings returned by ``get_all_strings()`` are filtered to those
    longer than ``QGRAM_LENGTH + 1`` that contain neither ``'.'`` nor
    ``'$'``.  ``create_dense_index`` and ``create_inverted_index`` are then
    each run in their own thread over that same list.

    Each worker is handed ``(strings, queue)`` and is expected to put exactly
    one ``(all_ok, error)`` tuple on the queue when it finishes; that is the
    only way a failure inside a thread becomes visible here.

    Raises:
        Exception: if no string survives the filter.
        The ``error`` object reported by the first failing worker, raised
        only after both worker threads have been joined.
    """
    strings = [
        string for string in get_all_strings()
        if len(string) > QGRAM_LENGTH + 1
        and '.' not in string
        and '$' not in string
    ]
    if not strings:
        raise Exception('No strings to index')

    # If an exception/error occurs in any of the threads it is
    # not detectable, hence inter-thread communication is used.
    queue = Queue()

    # Threading is used because it reduces indexing time to half.
    threads = [
        Thread(target=builder, args=(strings, queue))
        for builder in (create_dense_index, create_inverted_index)
    ]
    for t in threads:
        t.start()

    # Drain one result per worker.  Remember the first failure instead of
    # raising right away, so the other worker is never left running
    # un-joined while the caller is already handling the exception.
    first_error = None
    for _ in threads:
        all_ok, error = queue.get(block=True)
        queue.task_done()
        if not all_ok and first_error is None:
            first_error = error

    for t in threads:
        t.join()

    if first_error is not None:
        raise first_error
for string in candidate_strings: edit_distance = distance(unicode(qstring), unicode(string)) distance_cache[unicode(string)] = edit_distance if edit_distance < ED_THRESHOLD + 1: verified_approximate_matches.append(string) missing_matches = [(unicode(i), distance_cache[unicode(i)]) for i in verified_approximate_matches if i not in approximate_matches] missed_matches = [(unicode(i), distance_cache[unicode(i)]) for i in approximate_matches if i not in verified_approximate_matches] assert not missing_matches, 'Missing matches for %s: %s' % (qstring, missing_matches) assert not missed_matches, 'Missed matches for %s: %s' % (qstring, missed_matches) if __name__ == '__main__': if len(argv) == 1: strings = get_all_strings()[139654:141000] else: strings = argv[1:] strings_num = len(strings) # ********** EVALUATION ********** # cand_strings_threshold_evaluation = dict() for cand_strings_threshold in [5, 15, 25, 35, 45, 55]: CAND_STRINGS_THRESHOLD = cand_strings_threshold cand_strings_threshold_evaluation[cand_strings_threshold] = dict() for i, query_string in enumerate(strings): try: