Example #1
    def test_prepare_data_with_load(self):
        with tempfile.TemporaryDirectory(
                prefix="lookout_typos_prepare_load_") as temp_dir:
            config = {
                "data_dir": temp_dir,
                "dataset_url": "https://docs.google.com/uc?export=download&"
                               "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
                "input_path": None,
                "raw_data_filename": "raw_test_data.csv.xz",
                "vocabulary_size": 10,
                "frequencies_size": 20,
                "vocabulary_filename": "vocabulary.csv",
                "frequencies_filename": "frequencies.csv",
            }
            data = prepare_data(config)
            vocabulary = read_vocabulary(
                os.path.join(temp_dir, config["vocabulary_filename"]))
            self.assertEqual(len(vocabulary), config["vocabulary_size"])
            self.assertTrue(set(vocabulary).issubset(set(data[Columns.Token])))
            frequencies = read_frequencies(
                os.path.join(temp_dir, config["frequencies_filename"]))
            self.assertEqual(len(frequencies), config["frequencies_size"])
            self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
            self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
Example #2
    def test_read_functions(self):
        vocabulary = read_vocabulary(
            join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
        frequencies = read_frequencies(
            join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
        self.assertEqual(len(vocabulary), 100)
        self.assertSetEqual(set(vocabulary), set(frequencies.keys()))

    def test_prepare_data_from_file(self):
        temp_dir = mkdtemp()
        params = {
            "data_dir": temp_dir,
            "input_path": str(pathlib.Path(__file__).parent / "raw_test_data.csv.xz"),
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(params)
        vocabulary = read_vocabulary(
            os.path.join(temp_dir, params["vocabulary_filename"]))
        self.assertEqual(len(vocabulary), params["vocabulary_size"])
        self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
        frequencies = read_frequencies(
            os.path.join(temp_dir, params["frequencies_filename"]))
        self.assertEqual(len(frequencies), params["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
        shutil.rmtree(temp_dir)
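test_read_functions points both readers at the same file, which pins down the on-disk layout the docstrings below describe: every line carries a token followed by its count, read_vocabulary keeps only the tokens, and read_frequencies builds the token-to-count mapping. The following toy reimplementation illustrates that contract; it is not the library code (the real helpers also transparently handle the .xz-compressed files used above, and their exact behavior is an assumption here).

from typing import Dict, List


def toy_read_vocabulary(path: str) -> List[str]:
    # Keep the first whitespace-separated token of every non-empty line.
    with open(path) as fin:
        return [line.split()[0] for line in fin if line.strip()]


def toy_read_frequencies(path: str) -> Dict[str, int]:
    # Assume each non-empty line is exactly "token count".
    with open(path) as fin:
        return {token: int(count) for token, count
                in (line.split() for line in fin if line.strip())}

Under this layout the assertSetEqual check above holds by construction: reading the vocabulary and the frequencies from the same file must produce a vocabulary equal to the mapping's key set.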
Example #4
    def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
                  neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
                  edit_candidates: int = DEFAULT_EDIT_DISTANCE,
                  max_distance: int = DEFAULT_MAX_DISTANCE, radius: int = DEFAULT_RADIUS,
                  max_corrected_length: int = 12) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first whitespace-separated token of every \
                                line is added to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must \
                                 contain two whitespace-separated values: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param neighbors: Number of neighbors of the context and typo embeddings \
                          to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens at each edit distance \
                                from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for the SymSpell candidates lookup.
        :param radius: Maximum edit distance from the typo allowed for candidates.
        :param max_corrected_length: Maximum length of the prefix within which the \
                                     SymSpell lookup for typos is conducted.
        """
        self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                                prefix_length=max_corrected_length)
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.neighbors_number = neighbors
        self.edit_candidates_number = edit_candidates
        self.max_distance = max_distance
        self.radius = radius
        self.tokens = read_vocabulary(vocabulary_file)
        self.frequencies = read_frequencies(frequencies_file)
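For context, a call site for this signature might look like the sketch below. The class name CandidatesGenerator and every path are illustrative assumptions (the embeddings file must be a genuine FastText dump for load_fasttext_format to succeed); only the keyword names come from the signature above.

# Hypothetical usage of construct(); class name and paths are assumptions.
generator = CandidatesGenerator()
generator.construct(
    vocabulary_file="vocabulary.csv",    # "token count" lines, per the docstring
    frequencies_file="frequencies.csv",  # same format; becomes a token -> count dict
    embeddings_file="fasttext.bin",      # dump of a trained FastText model
    neighbors=20,                        # embedding neighbors collected per lookup
    edit_candidates=4,                   # most frequent tokens kept per edit distance
    max_distance=2,                      # SymSpell dictionary edit distance
    radius=3,                            # hard cutoff on candidate edit distance
)

Note how max_distance is baked into the SymSpell instance at construction time, while radius is merely stored on self, presumably to filter candidates during later lookups.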
Example #5
    def test_prepare_data_from_file(self):
        with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
            config = {
                "data_dir": temp_dir,
                "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
                "vocabulary_size": 10,
                "frequencies_size": 20,
                "vocabulary_filename": "vocabulary.csv",
                "frequencies_filename": "frequencies.csv",
            }
            data = prepare_data(config)
            vocabulary = read_vocabulary(os.path.join(temp_dir, config["vocabulary_filename"]))
            self.assertEqual(len(vocabulary), config["vocabulary_size"])
            self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
            frequencies = read_frequencies(os.path.join(temp_dir, config["frequencies_filename"]))
            self.assertEqual(len(frequencies), config["frequencies_size"])
            self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
            self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
Example #6
    def construct(self,
                  vocabulary_file: str,
                  frequencies_file: str,
                  embeddings_file: str,
                  config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct correction candidates generator.

        :param vocabulary_file: Text file used to generate the vocabulary of correction \
                                candidates. The first whitespace-separated token of every \
                                line is added to the vocabulary.
        :param frequencies_file: Path to the text file with frequencies. Each line must \
                                 contain two whitespace-separated values: "token count".
        :param embeddings_file: Path to the dump of the FastText model.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of the context and typo \
                                         embeddings to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens at each edit \
                                         distance from the typo to consider as candidates \
                                         (int).
                       max_distance: Maximum edit distance for the SymSpell candidates \
                                     lookup (int).
                       radius: Maximum edit distance from the typo allowed for candidates \
                               (int).
                       max_corrected_length: Maximum length of the prefix within which the \
                                             SymSpell lookup for typos is conducted (int).
                       start_pool_size: Minimum data length starting from which \
                                        multiprocessing is used (int).
                       chunksize: Maximum size of a chunk for one process during \
                                  multiprocessing (int).
        """
        self.set_config(config)
        self.checker = SymSpell(
            max_dictionary_edit_distance=self.config["max_distance"],
            prefix_length=self.config["max_corrected_length"])
        self.checker.load_dictionary(vocabulary_file)
        self.wv = FastText.load_fasttext_format(embeddings_file).wv
        self.tokens = set(read_vocabulary(vocabulary_file))
        self.frequencies = read_frequencies(frequencies_file)
        self.min_freq = min(self.frequencies.values())
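A matching call site for this config-driven revision might look like the sketch below; the keys mirror the docstring, while the bare instantiation and the assumption that set_config falls back to defaults for unspecified keys are mine.

# Hypothetical usage of the config-based construct(); the class name, the
# paths, and set_config's defaulting behavior are all assumptions.
generator = CandidatesGenerator()
generator.construct(
    vocabulary_file="vocabulary.csv",
    frequencies_file="frequencies.csv",
    embeddings_file="fasttext.bin",
    config={
        "neighbors_number": 20,      # embedding neighbors per lookup
        "edit_dist_number": 4,       # most frequent tokens per edit distance
        "max_distance": 2,           # SymSpell dictionary edit distance
        "radius": 3,                 # cutoff on candidate edit distance
        "max_corrected_length": 12,  # SymSpell prefix length
        "start_pool_size": 64,       # data length at which multiprocessing starts
        "chunksize": 256,            # per-process chunk size
    },
)

Folding the tuning knobs into a single mapping keeps the construct signature stable as options grow, which is plausibly why this revision supersedes the positional parameters of Example #4.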