def _init_vectorizer(self, language):
        """Attach a fresh sklearn TF-IDF vectorizer to this instance.

        The vectorizer tokenizes with the project's ``tokenize_light`` for
        the given *language*. Returns ``self`` for fluent chaining.
        """
        # Imported lazily so sklearn is only required when vectorizing.
        from sklearn.feature_extraction.text import (
            TfidfVectorizer as SklearnTfidfVectorizer)

        tokenize = lambda text: tokenize_light(text, language)
        self._tfidf_vectorizer = SklearnTfidfVectorizer(tokenizer=tokenize)
        return self
    def from_path(cls, path, **shared):
        """Load a persisted vectorizer from the directory *path*.

        Reads ``vectorizer.json`` and rebuilds the wrapped sklearn
        ``TfidfVectorizer``, restoring its vocabulary and IDF diagonal
        directly onto sklearn's private attributes.

        Raises:
            LoadingError: if ``vectorizer.json`` is missing under *path*.
        """
        # Heavy deps imported lazily, only when deserializing.
        import numpy as np
        import scipy.sparse as sp
        from sklearn.feature_extraction.text import (TfidfTransformer,
                                                     TfidfVectorizer as
                                                     SklearnTfidfVectorizer)

        directory = Path(path)
        model_path = directory / "vectorizer.json"
        if not model_path.exists():
            raise LoadingError("Missing vectorizer model file: %s"
                               % model_path.name)

        with model_path.open("r", encoding="utf-8") as f:
            model = json.load(f)

        self_ = cls(model["config"], **shared)
        self_._language = model["language_code"]

        scope = model["builtin_entity_scope"]
        # None means "no restriction"; otherwise restore as a set.
        self_.builtin_entity_scope = set(scope) if scope is not None else None

        serialized = model["vectorizer"]
        # A falsy payload (e.g. never fitted) is stored back unchanged.
        sklearn_vectorizer = serialized
        if serialized:
            idf_data = np.array(serialized["idf_diag"])
            size = len(idf_data)
            # Rebuild the IDF values as a sparse diagonal matrix.
            diagonal = list(range(size))
            idf_diag = sp.csr_matrix((idf_data, (diagonal, diagonal)),
                                     shape=(size, size))

            transformer = TfidfTransformer()
            # NOTE(review): relies on sklearn's private `_idf_diag` attribute.
            transformer._idf_diag = idf_diag

            sklearn_vectorizer = SklearnTfidfVectorizer(
                tokenizer=lambda text: tokenize_light(text, self_._language))
            sklearn_vectorizer.vocabulary_ = serialized["vocab"]
            sklearn_vectorizer._tfidf = transformer

        self_._tfidf_vectorizer = sklearn_vectorizer
        return self_
# Example #3
# 0
 def _init_vectorizer(self, language):
     """Create the TF-IDF vectorizer for *language* and return ``self``."""
     tokenize = lambda text: tokenize_light(text, language)
     self._tfidf_vectorizer = SklearnTfidfVectorizer(tokenizer=tokenize)
     return self
# Example #4
# 0
 def __init__(self, tfidf: SklearnTfidfVectorizer = None, **tfidf_params):
     """Wrap an existing vectorizer, or build one from *tfidf_params*.

     When *tfidf* is None a new ``SklearnTfidfVectorizer`` is constructed
     with the remaining keyword arguments; otherwise it is used as-is.
     """
     self.tfidf = SklearnTfidfVectorizer(**tfidf_params) if tfidf is None else tfidf
# Example #5
# 0
 def __init__(self, **vectorizer_kwargs):
     """Create the underlying sklearn TF-IDF encoder.

     All keyword arguments are forwarded to ``SklearnTfidfVectorizer``.
     """
     self.encoder = SklearnTfidfVectorizer(**vectorizer_kwargs)
# Example #6
# 0
 def __init__(self, line):
     """Build a TF-IDF vectorizer and immediately fit it on *line*.

     NOTE(review): *line* is passed straight to ``fit``, so it is
     presumably an iterable of documents — confirm against callers.
     """
     vectorizer = SklearnTfidfVectorizer()
     vectorizer.fit(line)
     self.__vectorizer = vectorizer