Example #1
 def test_read_functions(self):
     vocabulary = read_vocabulary(
         join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
     frequencies = read_frequencies(
         join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
     self.assertEqual(len(vocabulary), 100)
     self.assertSetEqual(set(vocabulary), set(frequencies.keys()))
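The test above exercises the two reader helpers from the lookout typos package. The snippet below is a minimal hedged sketch of what they are assumed to do, based on the "token count" file format and the "first token of every line" rule described in the construct() docstrings further down; the *_sketch names and the use of lzma for the .xz archive are illustrative assumptions, not the library's actual implementation.

import lzma
from typing import Dict, List

def read_frequencies_sketch(path: str) -> Dict[str, int]:
    # Assumed format: whitespace-separated "token count" pairs, one per line.
    frequencies = {}
    with lzma.open(path, "rt") as f:
        for line in f:
            token, count = line.split()[:2]
            frequencies[token] = int(count)
    return frequencies

def read_vocabulary_sketch(path: str) -> List[str]:
    # Keep only the first token of every line, preserving file order.
    with lzma.open(path, "rt") as f:
        return [line.split()[0] for line in f]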
Example #2
 def test_prepare_data_with_load(self):
     with tempfile.TemporaryDirectory(
             prefix="lookout_typos_prepare_load_") as temp_dir:
         config = {
             "data_dir": temp_dir,
             "dataset_url": "https://docs.google.com/uc?export=download&"
             "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
             "input_path": None,
             "raw_data_filename": "raw_test_data.csv.xz",
             "vocabulary_size": 10,
             "frequencies_size": 20,
             "vocabulary_filename": "vocabulary.csv",
             "frequencies_filename": "frequencies.csv",
         }
         data = prepare_data(config)
         vocabulary = read_vocabulary(
             os.path.join(temp_dir, config["vocabulary_filename"]))
         self.assertEqual(len(vocabulary), config["vocabulary_size"])
         self.assertTrue(set(vocabulary).issubset(set(data[Columns.Token])))
         frequencies = read_frequencies(
             os.path.join(temp_dir, config["frequencies_filename"]))
         self.assertEqual(len(frequencies), config["frequencies_size"])
         self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
         self.assertTrue({Columns.Token,
                          Columns.Split}.issubset(data.columns))
Example #3
 def test_prepare_data_from_file(self):
     temp_dir = mkdtemp()
     params = {
         "data_dir": temp_dir,
         "input_path": str(pathlib.Path(__file__).parent / "raw_test_data.csv.xz"),
         "vocabulary_size": 10,
         "frequencies_size": 20,
         "vocabulary_filename": "vocabulary.csv",
         "frequencies_filename": "frequencies.csv",
     }
     data = prepare_data(params)
     vocabulary = read_vocabulary(
         os.path.join(temp_dir, params["vocabulary_filename"]))
     self.assertEqual(len(vocabulary), params["vocabulary_size"])
     self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
     frequencies = read_frequencies(
         os.path.join(temp_dir, params["frequencies_filename"]))
     self.assertEqual(len(frequencies), params["frequencies_size"])
     self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
     self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
     shutil.rmtree(temp_dir)
Example #4
    def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
                  neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
                  edit_candidates: int = DEFAULT_EDIT_DISTANCE,
                  max_distance: int = DEFAULT_MAX_DISTANCE, radius: int = DEFAULT_RADIUS,
                  max_corrected_length: int = 12) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first whitespace-separated token of every \
                                line is added to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must be \
                                 two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param neighbors: Number of neighbors of the context and typo embeddings \
                          to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among those at equal \
                                edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for the symspell candidate lookup.
        :param radius: Maximum edit distance from the typo allowed for candidates.
        :param max_corrected_length: Maximum length of the prefix in which the symspell \
                                     lookup for typos is conducted.
        """
        self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                                prefix_length=max_corrected_length)
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.neighbors_number = neighbors
        self.edit_candidates_number = edit_candidates
        self.max_distance = max_distance
        self.radius = radius
        self.tokens = read_vocabulary(vocabulary_file)
        self.frequencies = read_frequencies(frequencies_file)
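A hedged usage sketch for the construct() call above: the CandidatesGenerator class name, the file names, and the concrete parameter values are assumptions for illustration; only the keyword names come from the signature shown in this example.

generator = CandidatesGenerator()  # hypothetical class holding construct()
generator.construct(
    vocabulary_file="vocabulary.csv",    # first token of each line enters the vocabulary
    frequencies_file="frequencies.csv",  # "token count" pairs, one per line
    embeddings_file="fasttext.bin",      # dump of the FastText model
    neighbors=20,                        # embedding neighbors considered as candidates
    edit_candidates=20,                  # most frequent tokens per edit distance
    max_distance=2,                      # symspell lookup edit distance
    radius=2,                            # maximum allowed edit distance for candidates
)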
Example #5
 def test_prepare_data_from_file(self):
     with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
         config = {
             "data_dir": temp_dir,
             "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
             "vocabulary_size": 10,
             "frequencies_size": 20,
             "vocabulary_filename": "vocabulary.csv",
             "frequencies_filename": "frequencies.csv",
         }
         data = prepare_data(config)
         vocabulary = read_vocabulary(os.path.join(temp_dir, config["vocabulary_filename"]))
         self.assertEqual(len(vocabulary), config["vocabulary_size"])
         self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
         frequencies = read_frequencies(os.path.join(temp_dir, config["frequencies_filename"]))
         self.assertEqual(len(frequencies), config["frequencies_size"])
         self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
         self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
Example #6
    def construct(self,
                  vocabulary_file: str,
                  frequencies_file: str,
                  embeddings_file: str,
                  config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first whitespace-separated token of every \
                                line is added to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must be \
                                 two values separated by whitespace: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of the context and typo \
                                         embeddings to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among those at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for the symspell candidate \
                                     lookup (int).
                       radius: Maximum edit distance from the typo allowed for candidates (int).
                       max_corrected_length: Maximum length of the prefix in which the \
                                             symspell lookup for typos is conducted (int).
                       start_pool_size: Length of the data starting from which \
                                        multiprocessing is used (int).
                       chunksize: Maximum size of a chunk for one process during \
                                  multiprocessing (int).
        """
        self.set_config(config)
        self.checker = SymSpell(
            max_dictionary_edit_distance=self.config["max_distance"],
            prefix_length=self.config["max_corrected_length"])
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.tokens = set(read_vocabulary(vocabulary_file))
        self.frequencies = read_frequencies(frequencies_file)
        self.min_freq = min(self.frequencies.values())
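For the config-driven variant above, a possible configuration mapping might look like the sketch below. The keys follow the docstring; the numeric values, file names, and the generator object (continuing the hypothetical instance from the sketch after Example #4) are assumptions.

candidates_config = {
    "neighbors_number": 20,      # embedding neighbors of context and typo
    "edit_dist_number": 20,      # most frequent tokens per edit distance
    "max_distance": 2,           # symspell lookup edit distance
    "radius": 2,                 # maximum allowed edit distance for candidates
    "max_corrected_length": 12,  # symspell prefix length
    "start_pool_size": 64,       # data size from which multiprocessing is used
    "chunksize": 256,            # chunk size per worker process
}
generator.construct("vocabulary.csv", "frequencies.csv", "fasttext.bin",
                    config=candidates_config)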
Example #7
def generate_vocabulary(frequencies_path: str, config: Mapping[str, Any]) -> Dict[str, int]:
    """
    Compose vocabulary from a set of tokens with known frequencies.

    Filtering of the input tokens depends on their frequencies and edit distances between them.
    All found English words and tokens that the algorithm considers word-like are added \
    regardless of their frequencies.

    :param frequencies_path: Path to the .csv file with space-separated word-frequency pairs, \
                             one pair per line.
    :param config: Configuration for the vocabulary creation:
                   stable: Number of tokens that have no more frequent \
                           edit-distance neighbors to take into the vocabulary.
                   suspicious: Number of tokens whose more frequent edit-distance neighbor \
                               is an English word to take into the vocabulary.
                   non_suspicious: Number of tokens whose more frequent edit-distance neighbor \
                                   is not an English word to take into the vocabulary.
    :return: Dictionary with the vocabulary tokens as keys and their corresponding \
             frequencies as values.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=100)
    checker.load_dictionary(frequencies_path)
    frequencies = read_frequencies(frequencies_path)
    sorted_frequencies = sorted(frequencies.items(), key=lambda x: -x[1])

    # For every token, find a token at edit distance 1 with a higher frequency, if one exists
    def _correct_token(token_freq):
        token, freq = token_freq
        suggestions = checker.lookup(token, 2, 1)
        if len(suggestions) > 1:
            correction = suggestions[1].term
            return correction, frequencies[correction]
        return token, freq
    corrections = list(tqdm(map(_correct_token, sorted_frequencies),
                            total=len(sorted_frequencies)))

    all_tokens = pandas.DataFrame(columns=["token", "token_freq", "correction", "correction_freq"])
    all_tokens["token"] = [token for token, _ in sorted_frequencies]
    all_tokens["token_freq"] = [freq for _, freq in sorted_frequencies]
    all_tokens["correction"] = [token_freq[0] if token_freq[1] > sorted_frequencies[i][1]
                                else sorted_frequencies[i][0]
                                for i, token_freq in enumerate(corrections)]
    all_tokens["correction_freq"] = [token_freq[1] if token_freq[1] > sorted_frequencies[i][1]
                                     else sorted_frequencies[i][1]
                                     for i, token_freq in enumerate(corrections)]
    all_tokens["rel"] = all_tokens["correction_freq"] / all_tokens["token_freq"]

    # Find all English words among all the tokens
    eng_voc = set()
    with smart_open(str(pathlib.Path(__file__).parent / "words_alpha.txt.xz"), "r") as f:
        for line in f:
            eng_voc.add(line.strip())

    # Leave only non-English tokens for analysis
    stable = all_tokens[(all_tokens.rel == 1.0) & ~all_tokens.token.isin(eng_voc)]
    unstable = all_tokens[(all_tokens.rel > 1) & ~all_tokens.token.isin(eng_voc)]

    # Get the lemmas of the tokens and their corrections
    spacy.cli.download("en")
    nlp = spacy.load("en", disable=["parser", "ner"])

    def _lemmatize(token):
        lemm = nlp(token)
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (token[-2:] == "ss" and
                                                           lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_
    token_lemma = list(tqdm(map(_lemmatize, list(unstable.token)), total=len(unstable)))
    correction_lemma = list(tqdm(map(_lemmatize, list(unstable.correction)), total=len(unstable)))
    unstable["token_lemma"] = token_lemma
    unstable["cor_lemma"] = correction_lemma

    # Equal lemmas -> different forms of a morphologically changing token -> token is a "word"
    # Use some heuristics to remove noise
    eq_lemmas = unstable[
        (unstable["token_lemma"] == unstable["cor_lemma"]) |
        (unstable["token_lemma"] == unstable["correction"]) &
        (~unstable["correction"].isin(eng_voc) |
         (unstable["correction"].apply(lambda x: x[-3:]) == "ing"))]
    dif_lemmas = unstable[(unstable["token_lemma"] != unstable["cor_lemma"]) &
                          (unstable["token_lemma"] != unstable["correction"])]

    # Stemming heuristics
    def _norm(word: str) -> str:
        if word[-2:] == "ed" or word[-2:] == "er" or (word[-1] == "s" and word[-2] != "s"):
            return word[:-1]
        return word
    norm_eq = dif_lemmas[(dif_lemmas.token.apply(_norm) == dif_lemmas.correction)]

    # Gather all results
    good = all_tokens[all_tokens.token.isin(set(
        list(eq_lemmas.token) + list(eq_lemmas.correction) +
        list(norm_eq.token) + list(norm_eq.correction)))]
    unstable = unstable[~unstable.token.isin(good.token)]
    stable = stable[~stable.token.isin(good.token)]

    # Suspicious tokens have a high probability of being typo-ed English words
    suspicious = unstable[unstable.correction.isin(eng_voc)]
    non_suspicious = unstable[~unstable.correction.isin(eng_voc)]
    vocabulary = all_tokens[all_tokens.token.isin(set(
        list(stable[:config["stable"]].token) +
        list(suspicious[:config["suspicious"]].token) +
        list(non_suspicious[:config["non_suspicious"]].token) +
        list(eng_voc) +
        list(good.token)))]
    return {token: freq for token, freq in vocabulary[["token", "token_freq"]].values}
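A hedged usage sketch for generate_vocabulary(): the config keys come from the docstring above, while the counts, the input path, and the output step (writing the whitespace-separated "token count" format that the readers in the earlier examples consume) are assumptions.

vocabulary = generate_vocabulary(
    "frequencies.csv",
    config={"stable": 5000, "suspicious": 1000, "non_suspicious": 2000})

# Persist the result in the "token count" format expected by
# read_vocabulary() and read_frequencies() shown in the examples above.
with open("vocabulary.csv", "w") as out:
    for token, freq in vocabulary.items():
        out.write("%s %d\n" % (token, freq))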