Example no. 1
0
 def test_ranker(self):
     """Every row of the test data must receive ranked suggestions after fitting."""
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
     # The CSV round-trip stores feature vectors as "[v1 v2 ...]" strings;
     # parse them back into lists of floats.
     candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
         lambda text: [float(token) for token in text[1:-1].split()])
     ranker = CandidatesRanker()
     ranker.fit(data[Columns.CorrectToken],
                get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     suggestions = ranker.rank(get_candidates_metadata(candidates),
                               get_candidates_features(candidates),
                               n_candidates=3, return_all=True)
     self.assertSetEqual(set(suggestions.keys()), set(data.index))
Example no. 2
0
 def test_ranker(self):
     """Fitting on the pickled candidates must yield suggestions for each typo id."""
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     pkl_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(pkl_path)
     ranker = CandidatesRanker()
     ranker.fit(data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     suggestions = ranker.rank(get_candidates_metadata(candidates),
                               get_candidates_features(candidates),
                               n_candidates=3, return_all=True)
     self.assertSetEqual(set(suggestions.keys()), set(data.index))
Example no. 3
0
 def test_eq(self):
     """Two untrained rankers compare equal; fitting one breaks the equality."""
     self.assertTrue(CandidatesRanker() == CandidatesRanker())
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
     # Feature vectors are serialized as "[v1 v2 ...]" strings in the CSV;
     # restore them to lists of floats before fitting.
     candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
         lambda text: [float(token) for token in text[1:-1].split()])
     ranker = CandidatesRanker()
     ranker.fit(data[Columns.CorrectToken],
                get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     self.assertFalse(ranker == CandidatesRanker())
Example no. 4
0
 def test_eq(self):
     """Fresh rankers are equal to each other but not to a fitted one."""
     self.assertTrue(CandidatesRanker() == CandidatesRanker())
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     pkl_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(pkl_path)
     ranker = CandidatesRanker()
     ranker.fit(data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     self.assertFalse(ranker == CandidatesRanker())
Example no. 5
0
 def test_save_load(self):
     """A fitted ranker serialized to a buffer must deserialize to an equal object."""
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
     # Feature vectors are stored as "[v1 v2 ...]" strings in the CSV; parse them.
     candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
         lambda x: list(map(float, x[1:-1].split())))
     ranker = CandidatesRanker()
     ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     with io.BytesIO() as buffer:
         ranker.save(output=buffer, series="typos-analyzer")
         buffer.seek(0)  # rewind so load() reads from the beginning
         ranker2 = CandidatesRanker().load(buffer)
     self.assertTrue(ranker == ranker2)
Example no. 6
0
 def test_save_load(self):
     """Saving and reloading the fitted ranker must preserve equality."""
     data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                            index_col=0).infer_objects()
     candidates = pandas.read_pickle(
         join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
     ranker = CandidatesRanker()
     ranker.fit(data[CORRECT_TOKEN_COLUMN],
                get_candidates_metadata(candidates),
                get_candidates_features(candidates))
     with io.BytesIO() as buffer:
         ranker.save(buffer)
         buffer.seek(0)  # rewind so load() reads from the beginning
         ranker2 = CandidatesRanker().load(buffer)
     self.assertTrue(ranker == ranker2)
Example no. 7
0
    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        if candidates is None:
            # No precalculated candidates: generate them (and optionally dump to disk).
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))
Example no. 8
0
    def train(self,
              typos: pandas.DataFrame,
              candidates: pandas.DataFrame = None,
              save_candidates_file: str = None) -> None:
        """
        Fit the ranker on a dataset of typos found inside identifiers.

        :param typos: DataFrame with columns "typo" and "identifier"; \
                      the optional "token_split" column is used when present.
        :param candidates: DataFrame with precalculated candidates.
        :param save_candidates_file: Path to the file where generated candidates are saved.
        """
        if candidates is None:
            # No precomputed candidates supplied: derive them from the typos.
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        metadata = get_candidates_metadata(candidates)
        features = get_candidates_features(candidates)
        self.ranker.fit(typos[CORRECT_TOKEN_COLUMN], metadata, features)
Example no. 9
0
    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self._log.info("train input shape: %s", data.shape)
        if candidates is None:
            self._log.info("candidates were not provided and will be generated")
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            # Keep the path before `candidates` is rebound to the DataFrame,
            # otherwise the log line below would dump the whole frame instead.
            candidates_path = candidates
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
            self._log.info("loaded candidates from %s", candidates_path)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))
Example no. 10
0
    def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            # keep_default_na=False keeps tokens like "nan"/"null" as literal strings.
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)
Example no. 11
0
    def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True, start_pool_size: int = DEFAULT_START_POOL_SIZE,
                chunksize: int = DEFAULT_CHUNKSIZE) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param start_pool_size: Length of data, starting from which multiprocessing is desired.
        :param chunksize: Max size of a chunk for one process during multiprocessing.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.processes_number, start_pool_size, chunksize, save_candidates_file)
        else:
            # keep_default_na=False keeps tokens like "nan"/"null" as literal strings.
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)
Example no. 12
0
    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None,
              start_pool_size: int = DEFAULT_START_POOL_SIZE,
              chunksize: int = DEFAULT_CHUNKSIZE) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        :param start_pool_size: Length of data, starting from which multiprocessing is desired.
        :param chunksize: Max size of a chunk for one process during multiprocessing.
        """
        self._log.info("train input shape: %s", data.shape)
        if candidates is None:
            self._log.info("candidates were not provided and will be generated")
            candidates = self.generator.generate_candidates(
                data, self.processes_number, start_pool_size, chunksize, save_candidates_file)
        else:
            # Keep the path before `candidates` is rebound to the DataFrame,
            # otherwise the log line below would dump the whole frame instead.
            candidates_path = candidates
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
            self._log.info("loaded candidates from %s", candidates_path)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))
Example no. 13
0
    def suggest(self,
                typos: pandas.DataFrame,
                candidates: pandas.DataFrame = None,
                save_candidates_file: str = None,
                n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Rank candidate corrections for the given typos.

        :param typos: DataFrame containing the column "typo"; \
                      the optional "token_split" column is used when present.
        :param candidates: DataFrame with precalculated candidates.
        :param save_candidates_file: Path to the file where generated candidates are saved.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary {id : [[candidate, correctness_proba]]} with candidates sorted \
                 by correctness probability in descending order.
        """
        if candidates is None:
            # No precomputed candidates supplied: generate them on the fly.
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        metadata = get_candidates_metadata(candidates)
        features = get_candidates_features(candidates)
        return self.ranker.rank(metadata, features, n_candidates, return_all)
Example no. 14
0
 def test_get_candidates_features(self):
     """The extracted feature matrix must match the fixture values exactly."""
     expected = numpy.array(
         [[0.1, 0.2], [0.8, 0.3], [0.1, 0.5]], dtype="float32")
     assert_array_equal(get_candidates_features(self.candidates), expected)