Example #1
    def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
                  neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
                  edit_candidates: int = DEFAULT_EDIT_DISTANCE,
                  max_distance: int = DEFAULT_MAX_DISTANCE, radius: int = DEFAULT_RADIUS,
                  max_corrected_length: int = 12) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first token of every line is added to the \
                                vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must contain \
                                 two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param neighbors: Number of neighbors of context and typo embeddings \
                          to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens at each edit distance \
                                from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for the symspell lookup for candidates.
        :param radius: Maximum edit distance from the typo allowed for candidates.
        :param max_corrected_length: Maximum length of the prefix in which the symspell lookup \
                                     for typos is conducted.
        """
        self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                                prefix_length=max_corrected_length)
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.neighbors_number = neighbors
        self.edit_candidates_number = edit_candidates
        self.max_distance = max_distance
        self.radius = radius
        self.tokens = read_vocabulary(vocabulary_file)
        self.frequencies = read_frequencies(frequencies_file)
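
A minimal usage sketch of this construct() variant (the file paths and numeric values below are hypothetical; it assumes the method lives on the CandidatesGenerator class shown in the later examples):

generator = CandidatesGenerator()
generator.construct(
    vocabulary_file="vocabulary.csv",      # hypothetical path, one "token count" pair per line
    frequencies_file="frequencies.csv",    # hypothetical path, one "token count" pair per line
    embeddings_file="fasttext_model.bin",  # hypothetical dump of a FastText model
    neighbors=20,
    edit_candidates=20,
    max_distance=2,
    radius=3,
)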
Example #2
    def _load_tree(self, tree: dict) -> None:
        self.__dict__.update(tree)
        self.tokens = split_strings(self.tokens)
        self.frequencies = {
            w: self.frequencies["vals"][i]
            for i, w in enumerate(split_strings(self.frequencies["keys"]))}
        self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
        self.checker.__dict__.update(tree["checker"])
        deletes = {}
        words = split_strings(self.checker._deletes["strings"])
        lengths = self.checker._deletes["lengths"]
        data = self.checker._deletes["data"]
        offset = 0
        for i, delindex in enumerate(self.checker._deletes["indexes"]):
            length = lengths[i]
            deletes[delindex] = [words[j] for j in data[offset:offset + length]]
            offset += length
        self.checker._deletes = deletes
        self.checker._words = {w: self.checker._words[i] for i, w in enumerate(words)}
        vectors = self.wv["vectors"]
        wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"], self.wv["max_n"],
                                  self.wv["bucket"], True)
        wv.vectors = numpy.array(vectors)
        vocab = split_strings(self.wv["vocab"]["strings"])
        wv.vocab = {
            s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
            for i, s in enumerate(vocab)}
        wv.bucket = self.wv["bucket"]
        wv.index2word = wv.index2entity = vocab
        wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
        wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
        wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
        self.wv = wv
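
The packed "indexes"/"lengths"/"data" layout that the loop above decodes can be illustrated on toy values (synthetic numbers, not a real serialized checker): each index owns a run of consecutive entries of "data", which are positions in a shared string table.

words = ["off", "offset", "onset"]                                  # shared string table
packed = {"indexes": [7, 9], "lengths": [1, 2], "data": [1, 0, 2]}  # synthetic example
deletes, offset = {}, 0
for i, delindex in enumerate(packed["indexes"]):
    length = packed["lengths"][i]
    # take `length` consecutive positions and map them back to strings
    deletes[delindex] = [words[j] for j in packed["data"][offset:offset + length]]
    offset += length
assert deletes == {7: ["offset"], 9: ["off", "onset"]}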
Example #3
    def construct(self,
                  vocabulary_file: str,
                  frequencies_file: str,
                  embeddings_file: str,
                  config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first token of every line is added \
                                to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must \
                                 contain two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens at each edit \
                                         distance from the typo to consider as candidates (int).
                       max_distance: Maximum edit distance for the symspell lookup for \
                                     candidates (int).
                       radius: Maximum edit distance from the typo allowed for candidates (int).
                       max_corrected_length: Maximum length of the prefix in which the symspell \
                                             lookup for typos is conducted (int).
                       start_pool_size: Data length starting from which multiprocessing \
                                        is used (int).
                       chunksize: Maximum size of a chunk for one process during \
                                  multiprocessing (int).
        """
        self.set_config(config)
        self.checker = SymSpell(
            max_dictionary_edit_distance=self.config["max_distance"],
            prefix_length=self.config["max_corrected_length"])
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.tokens = set(read_vocabulary(vocabulary_file))
        self.frequencies = read_frequencies(frequencies_file)
        self.min_freq = min(self.frequencies.values())
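
A minimal sketch of the config-driven construction (hypothetical paths and values; keys omitted from the config fall back to DEFAULT_CORRECTOR_CONFIG["generation"] via set_config, as shown in the later examples):

generator = CandidatesGenerator()
generator.construct(
    vocabulary_file="vocabulary.csv",      # hypothetical path
    frequencies_file="frequencies.csv",    # hypothetical path
    embeddings_file="fasttext_model.bin",  # hypothetical FastText dump
    config={
        "neighbors_number": 20,
        "edit_dist_number": 20,
        "max_distance": 2,
        "radius": 3,
        "max_corrected_length": 12,
        "start_pool_size": 64,
        "chunksize": 256,
    },
)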
Example #4
    def test_generate_candidates(self):
        symspell = SymSpell(max_dictionary_edit_distance=1)
        symspell.load_dictionary(VOCABULARY_FILE)
        self.assertEqual(symspell.lookup("ofset", 0, 1)[0].term, "offset")
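
A companion check in the same style could assert that an exact dictionary hit comes back at distance 0. This is a hedged sketch: it assumes VOCABULARY_FILE contains "offset" (which the test above already implies) and that suggestions expose a .distance attribute, as used elsewhere in this module.

    def test_lookup_exact_match(self):
        symspell = SymSpell(max_dictionary_edit_distance=1)
        symspell.load_dictionary(VOCABULARY_FILE)
        # an exact vocabulary hit should be the top suggestion at distance 0
        suggestion = symspell.lookup("offset", 0, 1)[0]
        self.assertEqual(suggestion.term, "offset")
        self.assertEqual(suggestion.distance, 0)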
Example #5
def generate_vocabulary(frequencies_path: str, config: Mapping[str, Any]) -> Dict[str, int]:
    """
    Compose vocabulary from a set of tokens with known frequencies.

    Filtering of the input tokens depends on their frequencies and the edit distances between them.
    All found English words and tokens that the algorithm considers word-like are added \
    regardless of their frequencies.

    :param frequencies_path: Path to the .csv file with space-separated word-frequency pairs, \
                             one per line.
    :param config: Configuration for the vocabulary creation:
                   stable: How many tokens that have no more frequent \
                           edit-distance neighbor to include in the vocabulary.
                   suspicious: How many tokens whose more frequent edit-distance neighbor is \
                               an English word to include in the vocabulary.
                   non_suspicious: How many tokens whose more frequent edit-distance neighbor \
                                   is not an English word to include in the vocabulary.
    :return: Dictionary with the vocabulary tokens as keys and their corresponding \
             frequencies as values.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=100)
    checker.load_dictionary(frequencies_path)
    frequencies = read_frequencies(frequencies_path)
    sorted_frequencies = sorted(frequencies.items(), key=lambda x: -x[1])

    # For every token, find a token at edit distance 1 with a higher frequency, if there is one
    def _correct_token(token_freq):
        token, freq = token_freq
        suggestions = checker.lookup(token, 2, 1)
        if len(suggestions) > 1:
            correction = suggestions[1].term
            return correction, frequencies[correction]
        return token, freq
    corrections = list(tqdm(map(_correct_token, sorted_frequencies),
                            total=len(sorted_frequencies)))

    all_tokens = pandas.DataFrame(columns=["token", "token_freq", "correction", "correction_freq"])
    all_tokens["token"] = [token for token, _ in sorted_frequencies]
    all_tokens["token_freq"] = [freq for _, freq in sorted_frequencies]
    all_tokens["correction"] = [token_freq[0] if token_freq[1] > sorted_frequencies[i][1]
                                else sorted_frequencies[i][0]
                                for i, token_freq in enumerate(corrections)]
    all_tokens["correction_freq"] = [token_freq[1] if token_freq[1] > sorted_frequencies[i][1]
                                     else sorted_frequencies[i][1]
                                     for i, token_freq in enumerate(corrections)]
    all_tokens["rel"] = all_tokens["correction_freq"] / all_tokens["token_freq"]

    # Find all English words among all the tokens
    eng_voc = set()
    with smart_open(str(pathlib.Path(__file__).parent / "words_alpha.txt.xz"), "r") as f:
        for line in f:
            eng_voc.add(line.strip())

    # Leave only non-English tokens for analysis
    stable = all_tokens[(all_tokens.rel == 1.0) & ~all_tokens.token.isin(eng_voc)]
    unstable = all_tokens[(all_tokens.rel > 1) & ~all_tokens.token.isin(eng_voc)]

    # Get the lemmas of the tokens and their corrections
    spacy.cli.download("en")
    nlp = spacy.load("en", disable=["parser", "ner"])

    def _lemmatize(token):
        lemm = nlp(token)
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (token[-2:] == "ss" and
                                                           lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_
    token_lemma = list(tqdm(map(_lemmatize, list(unstable.token)), total=len(unstable)))
    correction_lemma = list(tqdm(map(_lemmatize, list(unstable.correction)), total=len(unstable)))
    unstable["token_lemma"] = token_lemma
    unstable["cor_lemma"] = correction_lemma

    # Equal lemmas -> different forms of a morphologically changing token -> token is a "word"
    # Use some heuristics to remove noise
    eq_lemmas = unstable[
        (unstable["token_lemma"] == unstable["cor_lemma"]) |
        (unstable["token_lemma"] == unstable["correction"]) &
        (~unstable["correction"].isin(eng_voc) |
         (unstable["correction"].apply(lambda x: x[-3:]) == "ing"))]
    dif_lemmas = unstable[(unstable["token_lemma"] != unstable["cor_lemma"]) &
                          (unstable["token_lemma"] != unstable["correction"])]

    # Stemming heuristics
    def _norm(word: str) -> str:
        if word[-2:] == "ed" or word[-2:] == "er" or word[-1] == "s" and word[-2] != "s":
            return word[:-1]
        return word
    norm_eq = dif_lemmas[(dif_lemmas.token.apply(_norm) == dif_lemmas.correction)]

    # Gather all results
    good = all_tokens[all_tokens.token.isin(set(
        list(eq_lemmas[:].token) + list(eq_lemmas[:].correction) +
        list(norm_eq.token) + list(norm_eq.correction)))]
    unstable = unstable[~unstable.token.isin(good.token)]
    stable = stable[~stable.token.isin(good.token)]

    # Suspicious tokens have a high probability of being misspelled English words
    suspicious = unstable[unstable.correction.isin(eng_voc)]
    non_suspicious = unstable[~unstable.correction.isin(eng_voc)]
    vocabulary = all_tokens[all_tokens.token.isin(set(
        list(stable[:config["stable"]].token) +
        list(suspicious[:config["suspicious"]].token) +
        list(non_suspicious[:config["non_suspicious"]].token) +
        list(eng_voc) +
        list(good.token)))]
    return {token: freq for token, freq in vocabulary[["token", "token_freq"]].values}
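
A minimal call sketch (the path and the per-bucket budgets are hypothetical; the three counts fill the stable / suspicious / non_suspicious knobs documented above):

vocabulary = generate_vocabulary(
    "frequencies.csv",                    # hypothetical "token count" file
    config={"stable": 100000, "suspicious": 10000, "non_suspicious": 10000},
)
print(len(vocabulary))           # number of tokens kept
print(vocabulary.get("offset"))  # frequency of a kept token, or None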
Example #6
class CandidatesGenerator(Model):
    """
    Looks for candidates for correction of typos and generates features \
    for them. Candidates are generated in three ways: \
    1. Closest by cosine distance of embeddings to the given token. \
    2. Closest by cosine distance to the compound vector of token context. \
    3. The most frequent vocabulary tokens among those closest by edit distance.
    """

    NAME = "candidates_generator"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that generates candidates to fix typos."
    LICENSE = DEFAULT_LICENSE
    NO_COMPRESSION = ("/wv/vectors/", )

    def __init__(self, **kwargs):
        """Initialize a new instance of CandidatesGenerator."""
        super().__init__(**kwargs)
        self.checker = None
        self.wv = None
        self.tokens = set()
        self.frequencies = {}
        self.min_freq = 0

    def construct(self,
                  vocabulary_file: str,
                  frequencies_file: str,
                  embeddings_file: str,
                  config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first token of every line is added \
                                to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must \
                                 contain two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens at each edit \
                                         distance from the typo to consider as candidates (int).
                       max_distance: Maximum edit distance for the symspell lookup for \
                                     candidates (int).
                       radius: Maximum edit distance from the typo allowed for candidates (int).
                       max_corrected_length: Maximum length of the prefix in which the symspell \
                                             lookup for typos is conducted (int).
                       start_pool_size: Data length starting from which multiprocessing \
                                        is used (int).
                       chunksize: Maximum size of a chunk for one process during \
                                  multiprocessing (int).
        """
        self.set_config(config)
        self.checker = SymSpell(
            max_dictionary_edit_distance=self.config["max_distance"],
            prefix_length=self.config["max_corrected_length"])
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.tokens = set(read_vocabulary(vocabulary_file))
        self.frequencies = read_frequencies(frequencies_file)
        self.min_freq = min(self.frequencies.values())

    def set_config(self, config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Update candidates generation config.

        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens at each edit \
                                         distance from the typo to consider as candidates (int).
                       max_distance: Maximum edit distance for the symspell lookup for \
                                     candidates (int).
                       radius: Maximum edit distance from the typo allowed for candidates (int).
                       max_corrected_length: Maximum length of the prefix in which the symspell \
                                             lookup for typos is conducted (int).
                       start_pool_size: Data length starting from which multiprocessing \
                                        is used (int).
                       chunksize: Maximum size of a chunk for one process during \
                                  multiprocessing (int).
        """
        if config is None:
            config = {}
        self.config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["generation"],
                                  config)

    def expand_vocabulary(self, additional_tokens: Iterable[str]) -> None:
        """
        Add given tokens to the generator's vocabulary.

        :param additional_tokens: Tokens to add to the vocabulary.
        """
        self.tokens.update(additional_tokens)

    def generate_candidates(
            self,
            data: pandas.DataFrame,
            processes_number: int,
            save_candidates_file: Optional[str] = None) -> pandas.DataFrame:
        """
        Generate candidates for typos inside data.

        :param data: DataFrame that contains the column Columns.Token.
        :param processes_number: Number of processes for multiprocessing.
        :param save_candidates_file: File to save candidates to.
        :return: DataFrame containing candidates for corrections \
                 and features for their ranking for each typo.
        """
        data = add_context_info(data)
        typos = [
            TypoInfo(index, token, before, after)
            for index, token, before, after in zip(data.index, data[
                Columns.Token], data[Columns.Before], data[Columns.After])
        ]
        if len(typos) > self.config["start_pool_size"] and processes_number > 1:
            with Pool(min(processes_number, len(typos))) as pool:
                candidates = list(
                    tqdm(pool.imap(self._lookup_corrections_for_token,
                                   typos,
                                   chunksize=min(
                                       self.config["chunksize"],
                                       1 + len(typos) // processes_number)),
                         total=len(typos)))
        else:
            candidates = [self._lookup_corrections_for_token(t) for t in typos]
        candidates = pandas.DataFrame(list(chain.from_iterable(candidates)))
        candidates.columns = [
            Columns.Id, Columns.Token, Columns.Candidate, Columns.Features
        ]
        candidates.loc[:, Columns.Id] = candidates[Columns.Id].astype(
            data.index.dtype)
        if save_candidates_file is not None:
            candidates.to_csv(save_candidates_file, compression="xz")
        return candidates

    def dump(self) -> str:
        """
        Represent the candidates generator.
        """
        return "\n".join((
            "Vocabulary_size %d." % len(self.tokens),
            "Neighbors number %d." % self.config["neighbors_number"],
            "Maximum distance for search %d." % self.config["max_distance"],
            "Maximum distance allowed %d." % self.config["radius"],
            "Token for distance %d." % self.config["edit_dist_number"],
        ))

    def __eq__(self, other: "CandidatesGenerator") -> bool:
        def compare(first, second) -> bool:
            if isinstance(first, numpy.ndarray) or isinstance(
                    second, numpy.ndarray):
                if (first != second).any():
                    return False
                return True
            if isinstance(first, dict):
                assert isinstance(second, dict)
                for key, val in first.items():
                    val2 = second[key]
                    if hasattr(val, "__dict__"):
                        if type(val) != type(val2):
                            return False
                        if val.__dict__ != val2.__dict__:
                            return False
                    elif val != val2:
                        return False
                return True
            if first != second:
                return False
            return True

        for key in vars(self):
            if key == "_source":
                continue
            origin = getattr(self, key)
            peak = getattr(other, key)
            if key in ("checker", "wv"):
                for key2 in vars(origin):
                    if not compare(getattr(origin, key2), getattr(peak, key2)):
                        return False
            elif not compare(origin, peak):
                return False
        return True

    def _lookup_corrections_for_token(self,
                                      typo_info: TypoInfo) -> List[Features]:
        candidates = []
        candidate_tokens = self._get_candidate_tokens(typo_info)
        typo_vec = self._vec(typo_info.typo)
        dist_calc = EditDistance(typo_info.typo, "damerau")
        for candidate in set(candidate_tokens):
            candidate_vec = self.wv[candidate]
            dist = dist_calc.damerau_levenshtein_distance(
                candidate, self.config["radius"])
            if dist < 0:
                continue
            candidates.append(
                self._generate_features(typo_info, dist, typo_vec, candidate,
                                        candidate_vec))
        return candidates

    def _get_candidate_tokens(self, typo_info: TypoInfo) -> Set[str]:
        candidate_tokens = []
        last_dist = -1
        edit_candidates_count = 0
        if self.config["edit_dist_number"] > 0:
            for suggestion in self.checker.lookup(typo_info.typo, 2,
                                                  self.config["max_distance"]):
                if suggestion.distance != last_dist:
                    edit_candidates_count = 0
                    last_dist = suggestion.distance
                if edit_candidates_count >= self.config["edit_dist_number"]:
                    continue
                candidate_tokens.append(suggestion.term)
                edit_candidates_count += 1
        if self.config["neighbors_number"] > 0:
            typo_neighbors = self._closest(self._vec(typo_info.typo),
                                           self.config["neighbors_number"])
            candidate_tokens.extend(typo_neighbors)
            if len(typo_info.before + typo_info.after) > 0:
                context_neighbors = self._closest(
                    self._compound_vec("%s %s" %
                                       (typo_info.before, typo_info.after)),
                    self.config["neighbors_number"])
                candidate_tokens.extend(context_neighbors)
        candidate_tokens = {
            candidate
            for candidate in candidate_tokens if candidate in self.tokens
        }
        candidate_tokens.add(typo_info.typo)
        return candidate_tokens

    def _generate_features(
        self,
        typo_info: TypoInfo,
        dist: int,
        typo_vec: numpy.ndarray,
        candidate: str,
        candidate_vec: numpy.ndarray,
    ) -> Features:
        """
        Compile features for a single correction candidate.

        :param typo_info: instance of TypoInfo class.
        :param dist: edit distance from candidate to typo.
        :param typo_vec: embedding of the original token.
        :param candidate: candidate token.
        :param candidate_vec: embedding of the candidate token.
        :return: index, typo and candidate tokens, frequencies info, \
                 cosine distances between embeddings and contexts, \
                 edit distance between the tokens, \
                 embeddings of the tokens and contexts.
        """
        context = "%s %s" % (typo_info.before, typo_info.after)
        before_vec = self._compound_vec(typo_info.before)
        after_vec = self._compound_vec(typo_info.after)
        context_vec = self._compound_vec(context)
        return Features(
            typo_info.index, typo_info.typo, candidate,
            numpy.concatenate(((
                self._freq(typo_info.typo),
                self._freq(candidate),
                self._freq_relation(typo_info.typo, candidate),
                self._cos(typo_vec, before_vec),
                self._cos(typo_vec, after_vec),
                self._cos(typo_vec, context_vec),
                self._cos(candidate_vec, before_vec),
                self._cos(candidate_vec, after_vec),
                self._cos(candidate_vec, context_vec),
                self._avg_cos(typo_vec, typo_info.before),
                self._avg_cos(typo_vec, typo_info.after),
                self._avg_cos(typo_vec, context),
                self._avg_cos(candidate_vec, typo_info.before),
                self._avg_cos(candidate_vec, typo_info.after),
                self._avg_cos(candidate_vec, context),
                self._min_cos(typo_vec, typo_info.before),
                self._min_cos(typo_vec, typo_info.after),
                self._min_cos(typo_vec, context),
                self._min_cos(candidate_vec, typo_info.before),
                self._min_cos(candidate_vec, typo_info.after),
                self._min_cos(candidate_vec, context),
                self._cos(typo_vec, candidate_vec),
                dist,
                int(candidate in self.tokens),
            ), before_vec, after_vec, typo_vec, candidate_vec,
                               context_vec), ).astype(numpy.float32))

    def _vec(self, token: str) -> numpy.ndarray:
        return self.wv[token]

    def _freq(self, token: str) -> float:
        return float(self.frequencies.get(token, self.min_freq))

    @staticmethod
    def _cos(first_vec: numpy.ndarray, second_vec: numpy.ndarray) -> float:
        if numpy.linalg.norm(first_vec) * numpy.linalg.norm(second_vec) != 0:
            return cosine(first_vec, second_vec)
        return 1.0

    def _min_cos(self, typo_vec: numpy.ndarray, context: str) -> float:
        if not len(context.split()):
            return 1.0
        return min([2.0] + [
            self._cos(typo_vec, self._vec(token)) for token in context.split()
        ])

    def _avg_cos(self, typo_vec: numpy.ndarray, context: str) -> float:
        if not len(context.split()):
            return 1.0
        return sum([
            self._cos(typo_vec, self._vec(token)) for token in context.split()
        ]) / len(context.split())

    def _closest(self, item: Union[numpy.ndarray, str],
                 quantity: int) -> List[str]:
        return [
            token for token, _ in self.wv.most_similar([item], topn=quantity)
        ]

    def _freq_relation(self, first_token: str, second_token: str) -> float:
        return -numpy.log((1.0 * self._freq(first_token) + 1e-5) /
                          (1.0 * self._freq(second_token) + 1e-5))

    def _compound_vec(self, text: str) -> numpy.ndarray:
        split = text.split()
        compound_vec = numpy.zeros(self.wv["a"].shape)
        for token in split:
            compound_vec += self.wv[token]
        return compound_vec

    def _generate_tree(self) -> dict:
        tree = self.__dict__.copy()

        class DummyModel(Model):
            NAME = "dummy"
            VENDOR = "dummy"
            DESCRIPTION = "dummy"

        for key in vars(DummyModel()):
            del tree[key]
        freqkeys = [""] * len(self.frequencies)
        freqvals = numpy.zeros(len(self.frequencies), dtype=numpy.uint32)
        for i, (key, val) in enumerate(sorted(self.frequencies.items())):
            freqkeys[i] = key
            freqvals[i] = val
        tree["frequencies"] = {
            "keys": merge_strings(freqkeys),
            "vals": freqvals
        }
        tree["checker"] = self.checker.__dict__.copy()
        delstrs = set()
        delindexes = numpy.zeros(len(self.checker._deletes),
                                 dtype=numpy.uint32)
        dellengths = numpy.zeros_like(delindexes)
        for i, (key, dss) in enumerate(self.checker._deletes.items()):
            delindexes[i] = key
            dellengths[i] = len(dss)
            for ds in dss:
                delstrs.add(ds)
        delstrs = sorted(delstrs)
        delstrs_map = {s: i for i, s in enumerate(delstrs)}
        deldata = numpy.zeros(sum(dellengths), dtype=numpy.uint32)
        offset = 0
        for di in delindexes:
            dss = self.checker._deletes[di]
            for j, ds in enumerate(dss):
                deldata[offset + j] = delstrs_map[ds]
            offset += len(dss)
        tree["checker"]["_deletes"] = {
            "strings": merge_strings(delstrs),
            "indexes": delindexes,
            "lengths": dellengths,
            "data": deldata,
        }
        wordvals = numpy.zeros(len(self.checker._words), dtype=numpy.uint32)
        for key, val in self.checker._words.items():
            wordvals[delstrs_map[key]] = val
        tree["checker"]["_words"] = wordvals
        tree["tokens"] = merge_strings(sorted(self.tokens))
        vocab_strings = [""] * len(self.wv.vocab)
        vocab_counts = numpy.zeros(len(vocab_strings), dtype=numpy.uint32)
        for key, val in self.wv.vocab.items():
            vocab_strings[val.index] = key
            vocab_counts[val.index] = val.count
        hash2index = numpy.zeros(len(self.wv.hash2index), dtype=numpy.uint32)
        for key, val in self.wv.hash2index.items():
            hash2index[val] = key
        tree["wv"] = {
            "vocab": {
                "strings": merge_strings(vocab_strings),
                "counts": vocab_counts
            },
            "vectors": self.wv.vectors,
            "min_n": self.wv.min_n,
            "max_n": self.wv.max_n,
            "bucket": self.wv.bucket,
            "num_ngram_vectors": self.wv.num_ngram_vectors,
            "vectors_ngrams": self.wv.vectors_ngrams,
            "hash2index": hash2index,
        }
        return tree

    def _load_tree(self, tree: dict) -> None:
        self.__dict__.update(tree)
        self.tokens = set(split_strings(self.tokens))
        self.frequencies = {
            w: self.frequencies["vals"][i]
            for i, w in enumerate(split_strings(self.frequencies["keys"]))
        }
        self.checker = SymSpell(
            max_dictionary_edit_distance=self.config["max_distance"])
        self.checker.__dict__.update(tree["checker"])
        deletes = {}
        words = split_strings(self.checker._deletes["strings"])
        lengths = self.checker._deletes["lengths"]
        data = self.checker._deletes["data"]
        offset = 0
        for i, delindex in enumerate(self.checker._deletes["indexes"]):
            length = lengths[i]
            deletes[delindex] = [
                words[j] for j in data[offset:offset + length]
            ]
            offset += length
        self.checker._deletes = deletes
        self.checker._words = {
            w: self.checker._words[i]
            for i, w in enumerate(words)
        }
        vectors = self.wv["vectors"]
        wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"],
                                  self.wv["max_n"], self.wv["bucket"], True)
        wv.vectors = numpy.array(vectors)
        vocab = split_strings(self.wv["vocab"]["strings"])
        wv.vocab = {
            s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
            for i, s in enumerate(vocab)
        }
        wv.bucket = self.wv["bucket"]
        wv.index2word = wv.index2entity = vocab
        wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
        wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
        wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
        self.wv = wv
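
An end-to-end sketch of the class above (hypothetical paths; the input frame is assumed to carry whatever add_context_info needs in order to fill Columns.Before and Columns.After):

generator = CandidatesGenerator()
generator.construct("vocabulary.csv", "frequencies.csv", "fasttext_model.bin",
                    config={"neighbors_number": 20, "edit_dist_number": 20})
typos = pandas.DataFrame({Columns.Token: ["ofset", "lenght"]})
candidates = generator.generate_candidates(typos, processes_number=1)
# one row per (typo, candidate) pair with the features used for ranking
print(candidates[[Columns.Token, Columns.Candidate]].head())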
Example #7
class CandidatesGenerator(Model):
    """
    Looks for candidates for correction of typos and generates features \
    for them. Candidates are generated in three ways: \
    1. Closest by cosine distance of embeddings to the given token. \
    2. Closest by cosine distance to the compound vector of token context. \
    3. The most frequent vocabulary tokens among those closest by edit distance.
    """

    NAME = "candidates_generator"
    VENDOR = "source{d}"
    NO_COMPRESSION = ("/wv/vectors/", )
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_DISTANCE = 20

    def __init__(self, **kwargs):
        """Initialize a new instance of CandidatesGenerator."""
        super().__init__(**kwargs)
        self.checker = None
        self.wv = None
        self.neighbors_number = self.DEFAULT_NEIGHBORS_NUMBER
        self.edit_candidates_number = self.DEFAULT_EDIT_DISTANCE
        self.max_distance = self.DEFAULT_MAX_DISTANCE
        self.radius = self.DEFAULT_RADIUS
        self.tokens = []
        self.frequencies = {}

    def construct(self,
                  vocabulary_file: str,
                  frequencies_file: str,
                  embeddings_file: str,
                  neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
                  edit_candidates: int = DEFAULT_EDIT_DISTANCE,
                  max_distance: int = DEFAULT_MAX_DISTANCE,
                  radius: int = DEFAULT_RADIUS,
                  max_corrected_length: int = 12) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first token of every line is added to the \
                                vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must contain \
                                 two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param neighbors: Number of neighbors of context and typo embeddings \
                          to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens at each edit distance \
                                from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for the symspell lookup for candidates.
        :param radius: Maximum edit distance from the typo allowed for candidates.
        :param max_corrected_length: Maximum length of the prefix in which the symspell lookup \
                                     for typos is conducted.
        """
        self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                                prefix_length=max_corrected_length)
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.neighbors_number = neighbors
        self.edit_candidates_number = edit_candidates
        self.max_distance = max_distance
        self.radius = radius
        self.tokens = read_vocabulary(vocabulary_file)
        self.frequencies = read_frequencies(frequencies_file)

    def generate_candidates(self,
                            data: pandas.DataFrame,
                            threads_number: int,
                            save_candidates_file: str = None,
                            start_pool_size: int = 64) -> pandas.DataFrame:
        """
        Generate candidates for typos inside data.

        :param data: DataFrame containing the column TYPO_COLUMN.
        :param threads_number: Number of threads for multiprocessing.
        :param save_candidates_file: File to save candidates to.
        :param start_pool_size: Data length starting from which multiprocessing is used.
        :return: DataFrame containing candidates for corrections \
                 and features for their ranking for each typo.
        """
        data = add_context_info(data)
        typos = [
            TypoInfo(index, data.loc[index].typo, data.loc[index].before,
                     data.loc[index].after)
            for index in data.index
        ]
        if len(typos) > start_pool_size and threads_number > 1:
            with Pool(min(threads_number, len(typos))) as pool:
                candidates = list(
                    tqdm(pool.imap(self._lookup_corrections_for_token,
                                   typos,
                                   chunksize=min(
                                       256, 1 + len(typos) // threads_number)),
                         total=len(typos)))
        else:
            candidates = [self._lookup_corrections_for_token(t) for t in typos]
        candidates = pandas.DataFrame(list(chain.from_iterable(candidates)))
        candidates.columns = [
            ID_COLUMN, TYPO_COLUMN, CANDIDATE_COLUMN, FEATURES_COLUMN
        ]
        candidates[ID_COLUMN] = candidates[ID_COLUMN].astype(data.index.dtype)
        if save_candidates_file is not None:
            candidates.to_pickle(save_candidates_file)
        return candidates

    def dump(self) -> str:
        """
        Represent the candidates generator.
        """
        return "\n".join((
            "Vocabulary_size %d." % len(self.tokens),
            "Neighbors number %d." % self.neighbors_number,
            "Maximum distance for search %d." % self.max_distance,
            "Maximum distance allowed %d." % self.radius,
            "Token for distance %d." % self.edit_candidates_number,
        ))

    def __eq__(self, other: "CandidatesGenerator") -> bool:
        def compare(first, second) -> bool:
            if isinstance(first, numpy.ndarray) or isinstance(
                    second, numpy.ndarray):
                if (first != second).any():
                    return False
                return True
            if isinstance(first, dict):
                assert isinstance(second, dict)
                for key, val in first.items():
                    val2 = second[key]
                    if hasattr(val, "__dict__"):
                        if type(val) != type(val2):
                            return False
                        if val.__dict__ != val2.__dict__:
                            return False
                    elif val != val2:
                        return False
                return True
            if first != second:
                return False
            return True

        for key in vars(self):
            if key == "_source":
                continue
            origin = getattr(self, key)
            peak = getattr(other, key)
            if key in ("checker", "wv"):
                for key2 in vars(origin):
                    if not compare(getattr(origin, key2), getattr(peak, key2)):
                        return False
            elif not compare(origin, peak):
                return False
        return True

    def _lookup_corrections_for_token(self,
                                      typo_info: TypoInfo) -> List[Features]:
        candidates = []
        candidate_tokens = self._get_candidate_tokens(typo_info)
        typo_vec = self._vec(typo_info.typo)
        dist_calc = EditDistance(typo_info.typo, "damerau")
        for candidate in set(candidate_tokens):
            candidate_vec = self.wv[candidate]
            dist = dist_calc.damerau_levenshtein_distance(
                candidate, self.radius)

            if dist < 0:
                continue
            candidates.append(
                self._generate_features(typo_info, dist, typo_vec, candidate,
                                        candidate_vec))

        return candidates

    def _get_candidate_tokens(self, typo_info: TypoInfo) -> Set[str]:
        candidate_tokens = []
        last_dist = -1
        edit_candidates_count = 0
        if self.edit_candidates_number > 0:
            for suggestion in self.checker.lookup(typo_info.typo, 2,
                                                  self.max_distance):
                if suggestion.distance != last_dist:
                    edit_candidates_count = 0
                    last_dist = suggestion.distance
                if edit_candidates_count >= self.edit_candidates_number:
                    continue
                candidate_tokens.append(suggestion.term)
                edit_candidates_count += 1
        if self.neighbors_number > 0:
            typo_neighbors = self._closest(self._vec(typo_info.typo),
                                           self.neighbors_number)
            candidate_tokens.extend(typo_neighbors)

            if len(typo_info.before + typo_info.after) > 0:
                context_neighbors = self._closest(
                    self._compound_vec(typo_info.before + typo_info.after),
                    self.neighbors_number)
                candidate_tokens.extend(context_neighbors)

        candidate_tokens = {
            candidate
            for candidate in candidate_tokens if candidate in self.tokens
        }
        if not len(candidate_tokens):
            candidate_tokens.add(typo_info.typo)
        return candidate_tokens

    def _generate_features(
        self,
        typo_info: TypoInfo,
        dist: int,
        typo_vec: numpy.ndarray,
        candidate: str,
        candidate_vec: numpy.ndarray,
    ) -> Features:
        """
        Compile features for a single correction candidate.

        :param typo_info: instance of TypoInfo class.
        :param dist: edit distance from candidate to typo.
        :param typo_vec: embedding of the original token.
        :param candidate: candidate token.
        :param candidate_vec: embedding of the candidate token.
        :return: index, typo and candidate tokens, frequencies info, \
                 cosine distances between embeddings and contexts, \
                 edit distance between the tokens, \
                 embeddings of the tokens and contexts.
        """
        before_vec = self._compound_vec(typo_info.before)
        after_vec = self._compound_vec(typo_info.after)
        context_vec = self._compound_vec(typo_info.before + typo_info.after)
        return Features(
            typo_info.index, typo_info.typo, candidate,
            numpy.concatenate(((
                self._freq(typo_info.typo),
                self._freq(candidate),
                self._freq_relation(typo_info.typo, candidate),
                self._cos(typo_vec, before_vec),
                self._cos(typo_vec, after_vec),
                self._cos(typo_vec, context_vec),
                self._cos(candidate_vec, before_vec),
                self._cos(candidate_vec, after_vec),
                self._cos(candidate_vec, context_vec),
                self._cos(typo_vec, candidate_vec),
                dist,
            ), before_vec, after_vec, typo_vec, candidate_vec,
                               context_vec), ).astype(numpy.float32))

    def _vec(self, token: str) -> numpy.ndarray:
        return self.wv[token]

    def _freq(self, token: str) -> float:
        return float(self.frequencies.get(token, 0))

    @staticmethod
    def _cos(first_vec: numpy.ndarray, second_vec: numpy.ndarray) -> float:
        if numpy.linalg.norm(first_vec) * numpy.linalg.norm(second_vec) != 0:
            return cosine(first_vec, second_vec)
        return 1.0

    def _closest(self, item: Union[numpy.ndarray, str],
                 quantity: int) -> List[str]:
        return [
            token for token, _ in self.wv.most_similar([item], topn=quantity)
        ]

    def _freq_relation(self, first_token: str, second_token: str) -> float:
        return -numpy.log((1.0 * self._freq(first_token) + 1e-5) /
                          (1.0 * self._freq(second_token) + 1e-5))

    def _compound_vec(self, split: List[str]) -> numpy.ndarray:
        compound_vec = numpy.zeros(self.wv["a"].shape)
        if len(split) == 0:
            return compound_vec
        else:
            for token in split:
                compound_vec += self.wv[token]
        return compound_vec

    def _generate_tree(self) -> dict:
        tree = self.__dict__.copy()
        for key in vars(Model()):
            del tree[key]
        freqkeys = [""] * len(self.frequencies)
        freqvals = numpy.zeros(len(self.frequencies), dtype=numpy.uint32)
        for i, (key, val) in enumerate(sorted(self.frequencies.items())):
            freqkeys[i] = key
            freqvals[i] = val
        tree["frequencies"] = {
            "keys": merge_strings(freqkeys),
            "vals": freqvals
        }
        tree["checker"] = self.checker.__dict__.copy()
        delstrs = set()
        delindexes = numpy.zeros(len(self.checker._deletes),
                                 dtype=numpy.uint32)
        dellengths = numpy.zeros_like(delindexes)
        for i, (key, dss) in enumerate(self.checker._deletes.items()):
            delindexes[i] = key
            dellengths[i] = len(dss)
            for ds in dss:
                delstrs.add(ds)
        delstrs = sorted(delstrs)
        delstrs_map = {s: i for i, s in enumerate(delstrs)}
        deldata = numpy.zeros(sum(dellengths), dtype=numpy.uint32)
        offset = 0
        for di in delindexes:
            dss = self.checker._deletes[di]
            for j, ds in enumerate(dss):
                deldata[offset + j] = delstrs_map[ds]
            offset += len(dss)
        tree["checker"]["_deletes"] = {
            "strings": merge_strings(delstrs),
            "indexes": delindexes,
            "lengths": dellengths,
            "data": deldata,
        }
        wordvals = numpy.zeros(len(self.checker._words), dtype=numpy.uint32)
        for key, val in self.checker._words.items():
            wordvals[delstrs_map[key]] = val
        tree["checker"]["_words"] = wordvals
        tree["tokens"] = merge_strings(self.tokens)
        vocab_strings = [""] * len(self.wv.vocab)
        vocab_counts = numpy.zeros(len(vocab_strings), dtype=numpy.uint32)
        for key, val in self.wv.vocab.items():
            vocab_strings[val.index] = key
            vocab_counts[val.index] = val.count
        hash2index = numpy.zeros(len(self.wv.hash2index), dtype=numpy.uint32)
        for key, val in self.wv.hash2index.items():
            hash2index[val] = key
        tree["wv"] = {
            "vocab": {
                "strings": merge_strings(vocab_strings),
                "counts": vocab_counts
            },
            "vectors": self.wv.vectors,
            "min_n": self.wv.min_n,
            "max_n": self.wv.max_n,
            "bucket": self.wv.bucket,
            "num_ngram_vectors": self.wv.num_ngram_vectors,
            "vectors_ngrams": self.wv.vectors_ngrams,
            "hash2index": hash2index,
        }
        return tree

    def _load_tree(self, tree: dict) -> None:
        self.__dict__.update(tree)
        self.tokens = split_strings(self.tokens)
        self.frequencies = {
            w: self.frequencies["vals"][i]
            for i, w in enumerate(split_strings(self.frequencies["keys"]))
        }
        self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
        self.checker.__dict__.update(tree["checker"])
        deletes = {}
        words = split_strings(self.checker._deletes["strings"])
        lengths = self.checker._deletes["lengths"]
        data = self.checker._deletes["data"]
        offset = 0
        for i, delindex in enumerate(self.checker._deletes["indexes"]):
            length = lengths[i]
            deletes[delindex] = [
                words[j] for j in data[offset:offset + length]
            ]
            offset += length
        self.checker._deletes = deletes
        self.checker._words = {
            w: self.checker._words[i]
            for i, w in enumerate(words)
        }
        vectors = self.wv["vectors"]
        wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"],
                                  self.wv["max_n"])
        wv.vectors = vectors
        vocab = split_strings(self.wv["vocab"]["strings"])
        wv.vocab = {
            s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
            for i, s in enumerate(vocab)
        }
        wv.bucket = self.wv["bucket"]
        wv.index2word = wv.index2entity = vocab
        wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
        wv.vectors_ngrams = self.wv["vectors_ngrams"]
        wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
        self.wv = wv
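
The serialization hooks of this older variant can be exercised directly for an in-memory round trip (hypothetical paths; in the full workflow these hooks are presumably driven by save()/load() on the Model base class):

generator = CandidatesGenerator()
generator.construct("vocabulary.csv", "frequencies.csv", "fasttext_model.bin")
tree = generator._generate_tree()   # pack frequencies, checker and wv into plain arrays
restored = CandidatesGenerator()
restored._load_tree(tree)           # rebuild SymSpell and FastTextKeyedVectors from the tree
print(restored == generator)        # attribute-by-attribute comparison via the __eq__ above
print(restored.dump())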