def select_anagrams(token, structures):
    """Select possible anagrams for a given token.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    anagrams = {}

    # NOTE(review): token[1] assumes the token has at least 2 characters;
    # shorter tokens raise IndexError here -- confirm upstream guarantees this.
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    # One candidate hash per (alphabet char, focus char) substitution pair.
    hash_counter = Counter(
        token_hash + c - f
        for c in structures["alphabet"]
        for f in focus_alphabet
    )  # Counting retrieval occurence

    # dict key views support set intersection directly -- no temporary
    # set() copies needed.
    for h in hash_counter.keys() & structures["anagrams"].keys():
        count = hash_counter[h]
        for anag in structures["anagrams"][h]:
            # Keep only candidates close enough to the token.
            if edit_distance(anag, token) > 3:
                continue
            anag_score = rate_anagram(structures["occurence_map"], token,
                                      anag, count)
            if anag_score > 0:
                anagrams[anag] = anag_score

    return anagrams
def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCR-key lists into one candidate list.

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    # Copy so the caller's anagrams_list is not mutated as a side effect.
    final_list = dict(anagrams_list)

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)
    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        (strong_ocr_list, weak_ocr_list) = split_ocr_list(token, ocr_list)

    # BUG FIX: the original deleted entries from strong_ocr_list while
    # iterating strong_ocr_list.items(), which raises
    # "RuntimeError: dictionary changed size during iteration" in Python 3.
    # Iterate over a snapshot so deletion during the loop is safe.
    for ocr_word, ocr_score in list(strong_ocr_list.items()):
        if ocr_word in final_list:
            # Word found both ways: combine scores multiplicatively and drop
            # it from the OCR list so it is not re-scored below.
            final_list[ocr_word] *= ocr_score
            del strong_ocr_list[ocr_word]

    strong_ocr_list.update(weak_ocr_list)
    for ocr_word in strong_ocr_list:
        if ocr_word not in final_list:
            final_list[ocr_word] = rate_anagram(structures["occurence_map"], token, ocr_word, 1) \
                * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0)

    return final_list
def select_anagrams(token, structures):
    """Select possible anagrams for a given token.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    focus_alphabet = generate_alphabet_from_word(token[1])
    base_hash = anagram_hash(token)

    # Tally every candidate hash produced by substituting one focus character
    # with one alphabet character.  (Counting retrieval occurence.)
    candidate_hashes = Counter(
        base_hash + letter - focus
        for letter in structures["alphabet"]
        for focus in focus_alphabet
    )

    scored = {}
    known_hashes = set(structures["anagrams"].keys())
    for h in set(candidate_hashes.keys()).intersection(known_hashes):
        occurrences = candidate_hashes[h]
        for candidate in structures["anagrams"][h]:
            # Only keep candidates within edit distance 3 of the token.
            if edit_distance(candidate, token) <= 3:
                score = rate_anagram(structures["occurence_map"], token,
                                     candidate, occurrences)
                if score > 0:
                    scored[candidate] = score

    return scored