Esempio n. 1
0
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    candidates = {}
    base_hash = anagram_hash(token)
    # Alphabet built from the token's second character drives the search focus.
    focus = generate_alphabet_from_word(token[1])

    # Every hash reachable by adding one alphabet char and dropping one focus char.
    reachable = [base_hash + added - dropped
                 for added in structures["alphabet"]
                 for dropped in focus]
    occurrences = Counter(reachable)  # how often each candidate hash was produced

    # Only hashes that actually index known anagrams are worth scoring.
    for h in occurrences.keys() & structures["anagrams"].keys():
        hits = occurrences[h]
        for word in structures["anagrams"][h]:
            # Discard candidates too far from the token to be plausible.
            if edit_distance(word, token) > 3:
                continue
            score = rate_anagram(structures["occurence_map"], token, word, hits)
            if score > 0:
                candidates[word] = score

    return candidates
Esempio n. 2
0
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    # NOTE(review): `final_list` aliases `anagrams_list`, so the caller's dict
    # is mutated in place — presumably intended; confirm against callers.
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    # Large OCR lists are split into strong/weak halves; small ones are all "strong".
    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        (strong_ocr_list, weak_ocr_list) = split_ocr_list(token, ocr_list)

    # BUG FIX: deleting entries while iterating `.items()` raises
    # "RuntimeError: dictionary changed size during iteration" on Python 3.
    # Iterate over a snapshot so in-loop deletion is safe.
    for ocr_word, ocr_score in list(strong_ocr_list.items()):
        if ocr_word in final_list:
            # Word found by both methods: combine scores multiplicatively ...
            final_list[ocr_word] *= ocr_score
            # ... and remove it so it is not re-rated below.
            del strong_ocr_list[ocr_word]

    strong_ocr_list.update(weak_ocr_list)

    # Words found only via OCR keys get a freshly computed combined score.
    for ocr_word, ocr_score in strong_ocr_list.items():
        if ocr_word not in final_list:
            final_list[ocr_word] = rate_anagram(structures["occurence_map"], token, ocr_word, 1) \
                * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0)

    return final_list
Esempio n. 3
0
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    # NOTE(review): `final_list` aliases `anagrams_list`, so the caller's dict
    # is mutated in place — presumably intended; confirm against callers.
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    # Large OCR lists are split into strong/weak halves; small ones are all "strong".
    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        (strong_ocr_list, weak_ocr_list) = split_ocr_list(token, ocr_list)

    # BUG FIX: deleting entries while iterating `.items()` raises
    # "RuntimeError: dictionary changed size during iteration" on Python 3.
    # Iterate over a snapshot so in-loop deletion is safe.
    for ocr_word, ocr_score in list(strong_ocr_list.items()):
        if ocr_word in final_list:
            # Word found by both methods: combine scores multiplicatively ...
            final_list[ocr_word] *= ocr_score
            # ... and remove it so it is not re-rated below.
            del strong_ocr_list[ocr_word]

    strong_ocr_list.update(weak_ocr_list)

    # Words found only via OCR keys get a freshly computed combined score.
    for ocr_word, ocr_score in strong_ocr_list.items():
        if ocr_word not in final_list:
            final_list[ocr_word] = rate_anagram(structures["occurence_map"], token, ocr_word, 1) \
                * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0)

    return final_list
Esempio n. 4
0
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    scores = {}
    token_hash = anagram_hash(token)
    # Characters considered for removal come from the token's second character.
    removal_chars = generate_alphabet_from_word(token[1])

    # Count how often each single-substitution hash is generated.
    counts = Counter(
        token_hash + added - removed
        for added in structures["alphabet"]
        for removed in removal_chars
    )

    # Restrict to hashes that index known anagrams.
    known_hashes = set(counts.keys()).intersection(set(structures["anagrams"].keys()))
    for h in known_hashes:
        for candidate in structures["anagrams"][h]:
            # Keep only candidates within edit distance 3 of the token.
            if edit_distance(candidate, token) <= 3:
                rating = rate_anagram(
                    structures["occurence_map"], token, candidate, counts[h]
                )
                if rating > 0:
                    scores[candidate] = rating

    return scores