コード例 #1
0
ファイル: indexer.py プロジェクト: nnedkov/FuzzySearch
def create_indexes():
    """Build the dense and the inverted index, each in its own thread.

    The strings returned by get_all_strings() are filtered first: only
    strings longer than QGRAM_LENGTH + 1 characters that contain neither
    '.' nor '$' are passed on to the index builders.

    Both worker threads are always joined before this function returns or
    raises, so no index builder is left running in the background after a
    failure has been reported to the caller.

    Raises:
        Exception: if no string survives the filtering.
        The error object a worker reported through the queue, if any
        worker signalled failure.
    """
    strings = [string for string in get_all_strings()
               if len(string) > QGRAM_LENGTH + 1
               and '.' not in string
               and '$' not in string]
    if not strings:
        raise Exception('No strings to index')

    # If an exception/error occurs in any of the threads it is
    # not detectable, hence inter-thread communication is used.
    # One (all_ok, error) pair is read from the queue per started thread.
    queue = Queue()

    # Threading is used because it reduces indexing time to half.
    threads = [Thread(target=worker, args=(strings, queue))
               for worker in (create_dense_index, create_inverted_index)]

    for t in threads:
        t.start()

    try:
        for _ in threads:
            all_ok, error = queue.get(block=True)
            # Acknowledge the item before a possible raise so the queue's
            # unfinished-task counter stays balanced on the failure path.
            queue.task_done()
            if not all_ok:
                raise error
    finally:
        # Join even when a worker failed; otherwise the remaining thread
        # would keep running after the exception has left this function.
        for t in threads:
            t.join()
コード例 #2
0
ファイル: query.py プロジェクト: nnedkov/FuzzySearch
    for string in candidate_strings:
        edit_distance = distance(unicode(qstring), unicode(string))
        distance_cache[unicode(string)] = edit_distance
        if edit_distance < ED_THRESHOLD + 1:
            verified_approximate_matches.append(string)

    missing_matches = [(unicode(i), distance_cache[unicode(i)]) for i in verified_approximate_matches if i not in approximate_matches]
    missed_matches = [(unicode(i), distance_cache[unicode(i)]) for i in approximate_matches if i not in verified_approximate_matches]

    assert not missing_matches, 'Missing matches for %s: %s' % (qstring, missing_matches)
    assert not missed_matches, 'Missed matches for %s: %s' % (qstring, missed_matches)


if __name__ == '__main__':
    if len(argv) == 1:
        strings = get_all_strings()[139654:141000]
    else:
        strings = argv[1:]

    strings_num = len(strings)

# **********   EVALUATION   ********** #

    cand_strings_threshold_evaluation = dict()

    for cand_strings_threshold in [5, 15, 25, 35, 45, 55]:
        CAND_STRINGS_THRESHOLD = cand_strings_threshold
        cand_strings_threshold_evaluation[cand_strings_threshold] = dict()

        for i, query_string in enumerate(strings):
            try: