def build_candidates_list(token, anagrams_list, ocr_sims_list, structures):
    """Merge anagram and OCRkey list into one list.

    Words present in both the anagram list and the strong part of the OCR list have their
    anagram score multiplied by their OCR score. Every remaining OCR word (strong or weak)
    that is not an anagram candidate is added with a freshly computed score.

    Parameters:
        token (:func:`str`): Cleaned token
        anagrams_list (:func:`dict`): Result of `select_anagrams`. This dictionary is updated
            in place and is the object that gets returned.
        ocr_sims_list (:func:`dict`): Result of `select_ocrsims`
        structures (:func:`dict`): Datastructures from file (the ``"occurence_map"`` entry is
            read here)

    Returns:
        :func:`dict` - Correction tokens (keys) along with their score (values)
    """
    # Alias, not a copy: the caller's anagram dictionary receives the merged scores.
    final_list = anagrams_list

    ocr_list = truncate_ocr_sim_list(token, ocr_sims_list)

    # Only lists longer than 5 entries are split into a strong and a weak part;
    # shorter lists are treated as entirely strong.
    strong_ocr_list = ocr_list
    weak_ocr_list = {}
    if len(ocr_list) > 5:
        strong_ocr_list, weak_ocr_list = split_ocr_list(token, ocr_list)

    # Iterate over a snapshot of the items: entries are deleted inside the loop, and
    # deleting from a dict while iterating its live view raises RuntimeError on Python 3.
    for ocr_word, ocr_score in list(strong_ocr_list.items()):
        if ocr_word in final_list:
            final_list[ocr_word] *= ocr_score
            del strong_ocr_list[ocr_word]

    strong_ocr_list.update(weak_ocr_list)

    # Remaining OCR candidates that are not anagram candidates get a combined score.
    # NOTE(review): the trailing 1 and 0 are passed straight to the rating helpers; their
    # meaning is defined by `rate_anagram` / `rate_ocr_key`, which are not visible here.
    for ocr_word in strong_ocr_list:
        if ocr_word not in final_list:
            final_list[ocr_word] = (
                rate_anagram(structures["occurence_map"], token, ocr_word, 1)
                * rate_ocr_key(structures["occurence_map"], token, ocr_word, 0)
            )

    return final_list
def select_ocrsims(token, structures):
    """Select similar words for a given token

    The OCR key of the token is varied one element at a time: the cardinality of each element
    is shifted by -2, -1, +1 and +2 (never going below 1). Every resulting key string that is
    present in ``structures["ocrkeys"]`` contributes its words, filtered by
    ``edit_distance(word, token) <= 2``, and each surviving word is scored with `rate_ocr_key`.
    Only words with a strictly positive score are returned.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file (the ``"ocrkeys"`` and
            ``"occurence_map"`` entries are read here)

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    # NOTE(review): a function with this same name and the same logic is defined again
    # right after this one in the file; the later definition is the one that stays bound.
    max_shift = 2

    # Presumably an indexable sequence of (symbol, cardinality) pairs -- confirm against
    # `ocr_key_hash`.
    token_hash = ocr_key_hash(token)

    # Indexed by key string, so a key produced more than once keeps a single (latest) entry
    words_by_key = {}

    for position, (symbol, count) in enumerate(token_hash):
        variant = deepcopy(token_hash)

        for shift in range(-max_shift, max_shift + 1):
            if shift == 0:
                continue

            shifted = max(int(count) + shift, 1)
            variant[position] = (symbol, shifted)

            # Rebuild OCR key string
            variant_key = "".join(part + str(amount) for part, amount in variant)

            if variant_key not in structures["ocrkeys"]:
                continue

            distance = abs(int(count) - shifted)
            words_by_key[variant_key] = [
                (word, distance)
                for word in structures["ocrkeys"][variant_key]
                if edit_distance(word, token) <= 2
            ]

    scored = {}
    for word_list in words_by_key.values():
        for word, distance in word_list:
            score = rate_ocr_key(structures["occurence_map"], token, word, distance)
            if score > 0:
                scored[word] = score

    return scored
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Each element of the token's OCR key has its cardinality moved by every non-zero offset in
    [-2, 2], clamped to a minimum of 1. When the rebuilt key string exists in
    ``structures["ocrkeys"]``, the words stored under it that satisfy
    ``edit_distance(word, token) <= 2`` become candidates. Candidates are rated with
    `rate_ocr_key` and kept only when the rating is strictly positive.

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file (the ``"ocrkeys"`` and
            ``"occurence_map"`` entries are read here)

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    # NOTE(review): an earlier definition with this same name and the same logic precedes
    # this one in the file; this later definition is the one that stays bound.
    delta = 2
    offsets = tuple(step for step in range(-delta, delta + 1) if step != 0)

    # Presumably an indexable sequence of (symbol, cardinality) pairs -- confirm against
    # `ocr_key_hash`.
    base_hash = ocr_key_hash(token)

    # A dictionary avoids multiple entries when the same key string is retrieved twice
    candidates_per_key = {}

    for slot, (label, cardinality) in enumerate(base_hash):
        altered_hash = deepcopy(base_hash)

        for offset in offsets:
            new_card = max(int(cardinality) + offset, 1)
            altered_hash[slot] = (label, new_card)

            # Rebuild OCR key string
            altered_key = "".join(name + str(number) for name, number in altered_hash)

            if altered_key in structures["ocrkeys"]:
                gap = abs(int(cardinality) - new_card)
                candidates_per_key[altered_key] = [
                    (candidate, gap)
                    for candidate in structures["ocrkeys"][altered_key]
                    if edit_distance(candidate, token) <= 2
                ]

    result = {}
    for candidate_list in candidates_per_key.values():
        for candidate, gap in candidate_list:
            rating = rate_ocr_key(structures["occurence_map"], token, candidate, gap)
            if rating > 0:
                result[candidate] = rating

    return result