Example #1
def test_save(tokenizer, tmp_path):
    tokenizer.save(tmp_path)

    loaded_tokenizer = TransformersTokenizer()
    loaded_tokenizer.load(tmp_path)
    tokens = loaded_tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
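
Here `tokenizer` is the pytest fixture defined in Example #8 below, and `tmp_path` is pytest's built-in fixture providing a fresh temporary directory, so the test round-trips a fitted tokenizer through save and load.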
Example #2

def _init_model(self):
    self.tokenizer = TransformersTokenizer()
    self.model = Pipeline([
        (
            "tfidf",
            TfidfVectorizer(
                stop_words="english",
                max_df=0.95,
                min_df=0.0,
                ngram_range=(1, 1),
                tokenizer=self.tokenizer.tokenize,
            ),
        ),
        ("svm", OneVsRestClassifier(SVC(kernel="linear",
                                        probability=True))),
    ])
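
The key detail above is that `TfidfVectorizer` accepts any callable as its `tokenizer` argument, which overrides its default regex-based tokenization. A minimal self-contained sketch of the same pattern, using `str.split` as a stand-in for the project's tokenizer (scikit-learn only, no project-specific classes):

    from sklearn.feature_extraction.text import TfidfVectorizer

    # Any callable mapping a string to a list of tokens can be plugged in;
    # str.split stands in for TransformersTokenizer().tokenize here.
    vectorizer = TfidfVectorizer(tokenizer=str.split)
    X = vectorizer.fit_transform(["this is a test", "another test document"])
    print(X.shape)  # (2, number of distinct tokens kept)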
Example #3
def fit(self, X, *_):
    if self.tokenizer_library == "keras":
        self.tokenizer = KerasTokenizer(vocab_size=self.vocab_size,
                                        oov_token=self.oov_token)
    elif self.tokenizer_library == "transformers":
        if self.vocab_size is None:
            self.tokenizer = TransformersTokenizer()
        else:
            self.tokenizer = TransformersTokenizer(
                vocab_size=self.vocab_size)
    self.tokenizer.fit(X)
    if not self.sequence_length:
        logger.info(
            "Param sequence length not provided. Inferring from data. "
            "This might take a while...")
        self._infer_from_data(X)
    return self
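
This `fit` method appears again in context in Example #5, where the full `KerasVectorizer` class shows the `_infer_from_data` helper it calls.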
Example #4

# Imports assumed for this example; TransformersTokenizer comes from the
# surrounding project.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


class TfidfTransformersSVM:
    def _init_model(self):
        self.tokenizer = TransformersTokenizer()
        self.model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                    tokenizer=self.tokenizer.tokenize,
                ),
            ),
            ("svm", OneVsRestClassifier(SVC(kernel="linear",
                                            probability=True))),
        ])

    def set_params(self, **params):
        if not hasattr(self, "model"):
            self._init_model()

        # TODO: Pass params to TransformersTokenizer
        self.model.set_params(**params)

    def fit(self, X, Y):
        if not hasattr(self, "model"):
            # _init_model assigns self.model itself and returns None
            self._init_model()

        self.tokenizer.fit(X)
        self.model.fit(X, Y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)
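
A hypothetical usage sketch for this class, assuming `TransformersTokenizer` is importable from the surrounding project; the toy labels form a multilabel indicator matrix, which is what `OneVsRestClassifier` expects here:

    model = TfidfTransformersSVM()
    X = [
        "the cat sat on the mat",
        "stocks rallied after the earnings report",
    ]
    # One binary column per label; each column must contain both classes
    Y = [[1, 0], [0, 1]]
    model.fit(X, Y)
    print(model.predict(["markets and earnings news"]))
    print(model.predict_proba(["markets and earnings news"]))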
Example #5
# Imports assumed for this example; KerasTokenizer, TransformersTokenizer
# and the logger come from the surrounding project.
import gensim.downloader as api
import numpy as np
from os import path
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.preprocessing.sequence import pad_sequences


class KerasVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 vocab_size=None,
                 sequence_length=None,
                 oov_token="<OOV>",
                 tokenizer_library="keras"):
        self.vocab_size = vocab_size
        self.oov_token = oov_token
        self.sequence_length = sequence_length
        self.tokenizer_library = tokenizer_library

    def _infer_from_data(self, X, load_buffer=1000):
        # We could look at a sample instead for a more efficient estimate
        max_sequence_length = 1

        def update_max_sequence_length(X_buffer, max_sequence_length):
            X_tokens = self.tokenizer.encode(X_buffer)
            sequence_length = max(len(x) for x in X_tokens)
            return max(max_sequence_length, sequence_length)

        X_buffer = []
        for x in X:
            X_buffer.append(x)

            if len(X_buffer) >= load_buffer:
                max_sequence_length = update_max_sequence_length(
                    X_buffer, max_sequence_length)
                X_buffer = []

        if X_buffer:
            max_sequence_length = update_max_sequence_length(
                X_buffer, max_sequence_length)

        self.sequence_length = max_sequence_length

    def fit(self, X, *_):
        if self.tokenizer_library == "keras":
            self.tokenizer = KerasTokenizer(vocab_size=self.vocab_size,
                                            oov_token=self.oov_token)
        elif self.tokenizer_library == "transformers":
            if self.vocab_size is None:
                self.tokenizer = TransformersTokenizer()
            else:
                self.tokenizer = TransformersTokenizer(
                    vocab_size=self.vocab_size)
        self.tokenizer.fit(X)
        if not self.sequence_length:
            logger.info(
                "Param sequence length not provided. Inferring from data. "
                "This might take a while...")
            self._infer_from_data(X)
        return self

    def transform(self, X, *_):
        sequences = self.tokenizer.encode(X)
        return pad_sequences(sequences, maxlen=self.sequence_length)

    def build_embedding_matrix(self, embeddings_name_or_path=None):
        """
        Builds an embedding matrix from either a local embeddings path
        or a gensim pre-trained word vector path

        Args:
            embeddings_name_or_path:
                Can be either:
                - A local path to a word embeddings file
                - The name of a GenSim pre-trained word vector model
                    e.g. 'glove-twitter-25', for the complete list:
                    https://github.com/RaRe-Technologies/gensim-data#models

        Returns:
            An embedding matrix

        """
        local_embeddings = False
        if embeddings_name_or_path and path.isfile(embeddings_name_or_path):
            try:
                embeddings_index = {}
                with open(embeddings_name_or_path) as f:
                    for line in f:
                        word, coefs = line.split(maxsplit=1)
                        coefs = np.fromstring(coefs, "f", sep=" ")
                        embeddings_index[word] = coefs
                    emb_dim = len(coefs)
                local_embeddings = True
            except TypeError:
                raise TypeError("Incorrect local embeddings path")
        elif embeddings_name_or_path:
            try:
                embeddings_index = api.load(embeddings_name_or_path)
                emb_dim = embeddings_index.vector_size
            except ValueError:
                raise ValueError(
                    "Incorrect GenSim word vector model name, try e.g. 'glove-twitter-25'"
                )
        else:
            raise TypeError("No local or GenSim word embeddings given")

        num_words = len(self.tokenizer.vocab) + 1

        embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.tokenizer.vocab.items():
            if local_embeddings:
                embedding_vector = embeddings_index.get(word)
            else:
                # get_vector will error if the word isn't in the vocab
                try:
                    embedding_vector = embeddings_index.get_vector(word)
                except KeyError:
                    embedding_vector = None
            # Words not found in the embedding index stay all-zeros
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
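
A hypothetical usage sketch, assuming the project's `KerasTokenizer` is available; `'glove-twitter-25'` is a real model name from the gensim-data catalogue:

    vectorizer = KerasVectorizer(tokenizer_library="keras")
    X_padded = vectorizer.fit_transform([
        "a short text",
        "a slightly longer text about embeddings",
    ])
    print(X_padded.shape)  # (2, inferred sequence_length)

    # Downloads the vectors on first use, then builds a (vocab+1, 25) matrix
    embedding_matrix = vectorizer.build_embedding_matrix("glove-twitter-25")
    print(embedding_matrix.shape)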
Example #6
def test_lowercase():
    tokenizer = TransformersTokenizer(lowercase=False)
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert tokens[0] == "This"
Example #7
def test_bpe_model():
    tokenizer = TransformersTokenizer(model="bpe")
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
Example #8
@pytest.fixture
def tokenizer():
    tokenizer = TransformersTokenizer()
    tokenizer.fit(texts)
    return tokenizer
Example #9
def test_vocab_size():
    tokenizer = TransformersTokenizer(vocab_size=30)
    tokenizer.fit(texts)
    vocab = tokenizer.vocab
    print(vocab)
    assert len(vocab) == 30