def append_data(self, unigrams):
        """Add dictionary words found in *unigrams* to ``self.ocrkey_map``.

        Every word that appears both in the unigram map and in the aspell
        dictionary file is hashed with ``ocr_key_hash`` and grouped under
        its OCR-key string.  The new groups are merged (set union) into the
        existing ``self.ocrkey_map`` and the structure is saved.

        Parameters:
            unigrams (dict): word -> frequency map; only the keys are used.
        """
        aspell_dict = "models/aspell.en.dict"

        # One dictionary word per line; strip only the line terminator so
        # embedded whitespace (if any) is preserved.
        with open(aspell_dict, "r") as f:
            word_set = {line.strip("\r\n") for line in f}

        # Every word contained in the unigram map and the dictionary.
        valid_words = set(unigrams.keys()).intersection(word_set)

        # Group each valid word under its OCR-key string.  setdefault makes
        # the separate "pre-seed empty sets" pass of the original unnecessary
        # and hashes each word only once instead of twice.
        ocr_key_map = {}
        for word in valid_words:
            h_str = ocr_key_list_to_str(ocr_key_hash(word))
            ocr_key_map.setdefault(h_str, set()).add(word)

        # Bug fix: the original used ``dict.keys() + dict.keys()`` and
        # ``dict.items() + dict.items()``, which raise TypeError on Python 3
        # because dict views do not support ``+``.  Set union and iterating
        # both mappings are portable and equivalent.
        combine_struct = {
            key: set() for key in set(self.ocrkey_map) | set(ocr_key_map)
        }
        for mapping in (self.ocrkey_map, ocr_key_map):
            for key, value in mapping.items():
                combine_struct[key] = combine_struct[key].union(value)

        self.ocrkey_map = combine_struct
        self.save()
Exemple #2
0
    def append_data(self, unigrams):
        """Merge aspell-dictionary words from *unigrams* into ``self.ocrkey_map``.

        Words present both in *unigrams* and in the aspell dictionary file
        are grouped by their OCR-key string and united with the existing
        map, which is then persisted via ``self.save()``.

        Parameters:
            unigrams (dict): word -> frequency map; only the keys are used.
        """
        aspell_dict = "models/aspell.en.dict"
        word_list = []
        with open(aspell_dict, "r") as f:
            for line in f:
                word_list.append(line.strip("\r\n"))

        word_set = set(word_list)

        # Every word contained in the unigram map and the dictionary.
        valid_words = set(unigrams.keys()).intersection(word_set)

        ocr_key_map = {}
        for word in valid_words:
            # Hash once per word; group the word under its OCR-key string.
            key_str = ocr_key_list_to_str(ocr_key_hash(word))
            if key_str not in ocr_key_map:
                ocr_key_map[key_str] = set()
            ocr_key_map[key_str].add(word)

        # Bug fix: ``self.ocrkey_map.keys() + ocr_key_map.keys()`` (and the
        # matching ``.items() + .items()``) are Python 2 only -- dict views
        # cannot be concatenated with ``+`` on Python 3.  Materialising the
        # item lists keeps the original semantics on both versions.
        all_keys = set(self.ocrkey_map).union(ocr_key_map)
        combine_struct = {key: set() for key in all_keys}

        for key, value in list(self.ocrkey_map.items()) + list(ocr_key_map.items()):
            combine_struct[key] = combine_struct[key].union(value)

        self.ocrkey_map = combine_struct
        self.save()
Exemple #3
0
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    similarities = {}

    base_hash = ocr_key_hash(token)

    # Keyed by rebuilt hash string so a key reached twice is stored once.
    candidates = {}

    for position, (letter, count) in enumerate(base_hash):
        for offset in range(-delta, delta + 1):
            if offset == 0:
                continue

            # Perturb the cardinality at this position (floor of 1).
            altered = max(int(count) + offset, 1)
            variant = deepcopy(base_hash)
            variant[position] = (letter, altered)

            # Rebuild the OCR key string from the perturbed hash.
            variant_str = "".join(sym + str(num) for sym, num in variant)

            if variant_str in structures["ocrkeys"]:
                distance = abs(int(count) - altered)
                candidates[variant_str] = [
                    (candidate, distance)
                    for candidate in structures["ocrkeys"][variant_str]
                    if edit_distance(candidate, token) <= 2
                ]

    # Score every surviving candidate; keep only strictly positive scores.
    for kept in candidates.values():
        for candidate, distance in kept:
            score = rate_ocr_key(structures["occurence_map"], token,
                                 candidate, distance)
            if score > 0:
                similarities[candidate] = score

    return similarities
Exemple #4
0
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    max_offset = 2
    scored = {}

    token_hash = ocr_key_hash(token)

    # A dict avoids duplicate entries when the same key is retrieved twice.
    matches = {}
    index = -1

    for symbol, cardinality in token_hash:
        index += 1
        mutated = deepcopy(token_hash)

        # Try every non-zero perturbation of this position's cardinality.
        offsets = [d for d in range(-max_offset, max_offset + 1) if d != 0]
        for d in offsets:
            new_card = int(cardinality) + d
            if new_card < 1:
                new_card = 1  # cardinalities never drop below one

            mutated[index] = (symbol, new_card)

            # Rebuild the OCR key string for the mutated hash.
            rebuilt = ""
            for sym, num in mutated:
                rebuilt += sym + str(num)

            if rebuilt in structures["ocrkeys"]:
                diff = abs(int(cardinality) - new_card)
                kept = []
                for word in structures["ocrkeys"][rebuilt]:
                    # Only keep words within edit distance 2 of the token.
                    if edit_distance(word, token) <= 2:
                        kept.append((word, diff))
                matches[rebuilt] = kept

    # Rate each retained word; positive scores survive.
    for word_list in matches.values():
        for word, diff in word_list:
            rating = rate_ocr_key(structures["occurence_map"], token,
                                  word, diff)
            if rating > 0:
                scored[word] = rating

    return scored