Python CandidatesRanker.constructの例

プログラミング言語: Python

名前空間/パッケージ名: lookout.style.typos.ranking

クラス/型: CandidatesRanker

メソッド/関数: construct

hotexamples.comのコード掲載数: 2

Python CandidatesRanker.construct - 2件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのlookout.style.typos.ranking.CandidatesRanker.constructの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

fit(11)

CandidatesRanker(10)

rank(7)

_generate_tree(3)

_load_tree(3)

dump(3)

construct(2)

save(2)

set_config(1)

コード例 #1

ファイルを表示

ファイル: corrector.py プロジェクト: warenlg/style-analyzer

class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {"max_depth": 6,
                           "eta": 0.03,
                           "min_child_weight": 2,
                           "silent": 1,
                           "objective": "binary:logistic",
                           "subsample": 0.5,
                           "colsample_bytree": 0.5,
                           "alpha": 1,
                           "eval_metric": ["error"],
                           "nthread": 0}

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self) -> int:
        """Return the number of threads for multiprocessing used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    @threads_number.setter
    def threads_number(self, threads_number: int):
        """Set the number of threads for multiprocessing used to train and to predict."""
        self.ranker.boost_param["nthread"] = threads_number

    def initialize_ranker(self, boost_params: Optional[dict] = None,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM.
        :return: Nothing
        """
        boost_params = boost_params or self.DEFAULT_BOOST_PARAM
        self.ranker.construct(boost_params, train_rounds, early_stopping)

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: Optional[str] = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file,
                                 neighbors_number, edit_candidates, max_distance, radius)

    def train(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates:  Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0), candidates, save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates:  Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0), candidates,
                            save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.

        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(data), batch_size)):
            suggestions = self.suggest(data.iloc[i:i + batch_size, :], n_candidates=n_candidates,
                                       return_all=return_all)
            all_suggestions.append(suggestions.items())
        return dict(chain.from_iterable(all_suggestions))

    def evaluate(self, test_data: pandas.DataFrame) -> None:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and print it to the standard output.
        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present
        """
        suggestions = self.suggest(test_data)
        self.metrics = get_scores(test_data, suggestions)
        print(generate_report(test_data, suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" %
                (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])

コード例 #2

ファイルを表示

class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"

    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {
        "max_depth": 6,
        "eta": 0.03,
        "min_child_weight": 2,
        "silent": 1,
        "objective": "binary:logistic",
        "subsample": 0.5,
        "colsample_bytree": 0.5,
        "alpha": 1,
        "eval_metric": ["error"],
        "nthread": 0
    }

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self):
        """Return the number of threads used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    def initialize_ranker(self,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING,
                          boost_params: dict = None) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM.
        :return: Nothing
        """
        boost_params = boost_params or self.DEFAULT_BOOST_PARAM
        self.ranker.construct(train_rounds, early_stopping, boost_params)

    def initialize_generator(self,
                             vocabulary_file: str,
                             frequencies_file: str,
                             embeddings_file: str = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path ot the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        :return: Nothing
        """
        self.generator.construct(vocabulary_file, frequencies_file,
                                 embeddings_file, neighbors_number,
                                 edit_candidates, max_distance, radius)

    def train(self,
              typos: pandas.DataFrame,
              candidates: pandas.DataFrame = None,
              save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos: DataFrame containing columns "typo" and "identifier",
                      column "token_split" is optional, but used when present.
        :param candidates: DataFrame with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        self.ranker.fit(typos[CORRECT_TOKEN_COLUMN],
                        get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self,
                      typos_file: str,
                      candidates_file: str = None,
                      save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos_file: CSV file with columns "typo" and "identifier",
                           column "token_split" is optional, but used when present.
        :param candidates_file: Pickle dump of pandas.DataFrame with precalculated \
                                candidates and features
        :param save_candidates_file: Path to file where to save the candidates.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            candidates = pandas.read_pickle(candidates_file)
        self.train(typos, candidates, save_candidates_file)

    def suggest(self,
                typos: pandas.DataFrame,
                candidates: pandas.DataFrame = None,
                save_candidates_file: str = None,
                n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present
        :param candidates: DataFrame with precalculated candidates
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :param save_candidates_file: Path to file to save candidates to
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates),
                                n_candidates, return_all)

    def suggest_file(
            self,
            typos_file: str,
            candidates_file: str = None,
            save_candidates_file: str = None,
            n_candidates: int = 3,
            return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos_file: csv file containing DataFrame with column "typo", \
                           column "token_split" is optional, but used when present
        :param candidates_file: pickle file containing DataFrame with precalculated \
                                candidates and features
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :param save_candidates_file: Path to file to save candidates to
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            candidates = pandas.read_pickle(candidates_file)
        return self.suggest(typos, candidates, save_candidates_file,
                            n_candidates, return_all)

    def suggest_by_batches(
        self,
        typos: pandas.DataFrame,
        n_candidates: int = None,
        return_all: bool = True,
        batch_size: int = 2048,
    ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Correct typos from dataset by batches. Does not support precalculated candidates.

        Suggest corrections for given typos
        :param typos: DataFrame containing column "typo", \
               column "token_split" is optional, but used when present
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :param batch_size: Batch size
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(typos), batch_size)):
            suggestions = self.suggest(
                typos.loc[typos.index[i]:typos.
                          index[min(len(typos) - 1, i + batch_size - 1)], :],
                n_candidates=n_candidates,
                return_all=return_all)
            all_suggestions.append(suggestions.items())

        return dict(chain.from_iterable(all_suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        return {
            "generator": self.generator._generate_tree(),
            "ranker": self.ranker._generate_tree()
        }

    def _load_tree(self, tree: dict) -> None:
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])