from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# TransformersTokenizer is assumed to be provided by the surrounding project.


class TfidfTransformersSVM:
    def _init_model(self):
        self.tokenizer = TransformersTokenizer()
        self.model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                    tokenizer=self.tokenizer.tokenize,
                ),
            ),
            ("svm", OneVsRestClassifier(SVC(kernel="linear",
                                            probability=True))),
        ])

    def set_params(self, **params):
        if not hasattr(self, "model"):
            self._init_model()

        # TODO: Pass params to TransformersTokenizer
        self.model.set_params(**params)

    def fit(self, X, Y):
        if not hasattr(self, "model"):
            # _init_model sets self.model in place; it does not return the pipeline
            self._init_model()

        self.tokenizer.fit(X)
        self.model.fit(X, Y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)
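
A minimal usage sketch for the class above, assuming TransformersTokenizer (a project-local class) is importable and that Y is a binary indicator matrix as expected by OneVsRestClassifier; the texts and tags below are illustrative only, not from the source:

# Hypothetical usage sketch; data and tags are illustrative.
X = [
    "malaria vaccine trial",
    "mosquito net distribution",
    "machine learning for genomics",
    "deep learning models of proteins",
]
Y = [
    [1, 0],  # tag: infectious disease
    [1, 0],
    [0, 1],  # tag: data science
    [0, 1],
]

model = TfidfTransformersSVM()
model.set_params(tfidf__ngram_range=(1, 2))  # routed to the sklearn Pipeline
model.fit(X, Y)
preds = model.predict(["malaria genomics"])
probs = model.predict_proba(["malaria genomics"])  # probabilities on toy data are not meaningful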
Example 2
import logging
from os import path

import gensim.downloader as api
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# pad_sequences import path assumes TensorFlow 2.x Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# KerasTokenizer and TransformersTokenizer are assumed to come from the
# surrounding project.

logger = logging.getLogger(__name__)


class KerasVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self,
                 vocab_size=None,
                 sequence_length=None,
                 oov_token="<OOV>",
                 tokenizer_library="keras"):
        self.vocab_size = vocab_size
        self.oov_token = oov_token
        self.sequence_length = sequence_length
        self.tokenizer_library = tokenizer_library

    def _infer_from_data(self, X, load_buffer=1000):
        # We could look at a sample of the data to make this more efficient
        max_sequence_length = 1

        def update_max_sequence_length(X_buffer, max_sequence_length):
            X_tokens = self.tokenizer.encode(X_buffer)
            sequence_length = max(len(x) for x in X_tokens)
            return max(sequence_length, max_sequence_length)

        X_buffer = []
        for x in X:
            X_buffer.append(x)

            if len(X_buffer) >= load_buffer:
                max_sequence_length = update_max_sequence_length(
                    X_buffer, max_sequence_length)
                X_buffer = []

        if X_buffer:
            max_sequence_length = update_max_sequence_length(
                X_buffer, max_sequence_length)

        self.sequence_length = max_sequence_length

    def fit(self, X, *_):
        if self.tokenizer_library == "keras":
            self.tokenizer = KerasTokenizer(vocab_size=self.vocab_size,
                                            oov_token=self.oov_token)
        elif self.tokenizer_library == "transformers":
            if self.vocab_size is None:
                self.tokenizer = TransformersTokenizer()
            else:
                self.tokenizer = TransformersTokenizer(
                    vocab_size=self.vocab_size)
        else:
            raise ValueError(
                f"Unrecognised tokenizer_library: {self.tokenizer_library}")
        self.tokenizer.fit(X)
        if not self.sequence_length:
            logger.info(
                "Param sequence_length not provided. Inferring from data. "
                "This might take a while...")
            self._infer_from_data(X)
        return self

    def transform(self, X, *_):
        sequences = self.tokenizer.encode(X)
        return pad_sequences(sequences, maxlen=self.sequence_length)

    def build_embedding_matrix(self, embeddings_name_or_path=None):
        """
        Builds an embedding matrix from either a local embeddings file
        or the name of a gensim pre-trained word vector model

        Args:
            embeddings_name_or_path:
                Can be either:
                - A path to a local word embeddings file (one word and its
                  vector per line)
                - The name of a GenSim pre-trained word vector model
                    e.g. 'glove-twitter-25', for the complete list:
                    https://github.com/RaRe-Technologies/gensim-data#models

        Returns:
            An embedding matrix

        """
        local_embeddings = False
        if embeddings_name_or_path and path.isfile(embeddings_name_or_path):
            try:
                embeddings_index = {}
                with open(embeddings_name_or_path) as f:
                    for line in f:
                        word, coefs = line.split(maxsplit=1)
                        coefs = np.fromstring(coefs, "f", sep=" ")
                        embeddings_index[word] = coefs
                    emb_dim = len(coefs)
                local_embeddings = True
            except TypeError:
                raise TypeError("Incorrect local embeddings path")
        elif embeddings_name_or_path:
            try:
                embeddings_index = api.load(embeddings_name_or_path)
                emb_dim = embeddings_index.vector_size
            except ValueError:
                raise ValueError(
                    "Incorrect GenSim word vector model name, try e.g. 'glove-twitter-25'"
                )
        else:
            raise TypeError("No local or GenSim word embeddings given")

        num_words = len(self.tokenizer.vocab) + 1

        embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.tokenizer.vocab.items():
            if local_embeddings:
                embedding_vector = embeddings_index.get(word)
            else:
                # get_vector will error if the word isn't in the vocab
                try:
                    embedding_vector = embeddings_index.get_vector(word)
                except KeyError:
                    embedding_vector = None
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
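
A minimal sketch of how the vectorizer above might be used end to end; the example texts, the 'glove-twitter-25' model name, and the downstream Embedding layer are illustrative assumptions, not part of the original class:

# Hypothetical usage sketch; texts and the gensim model name are illustrative.
texts = ["malaria vaccine trial", "machine learning for genomics"]

vectorizer = KerasVectorizer(tokenizer_library="keras")
X_seq = vectorizer.fit_transform(texts)  # padded integer sequences, shape (n_docs, sequence_length)

# Downloads the vectors via gensim-data on first use.
embedding_matrix = vectorizer.build_embedding_matrix("glove-twitter-25")

# The matrix is typically used as frozen initial weights of an Embedding layer.
from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)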
Example 3
def test_lowercase():
    tokenizer = TransformersTokenizer(lowercase=False)
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert tokens[0] == "This"
Example 4
def test_bpe_model():
    tokenizer = TransformersTokenizer(model="bpe")
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
Example 5
def tokenizer():
    tokenizer = TransformersTokenizer()
    tokenizer.fit(texts)
    return tokenizer
Example 6
def test_vocab_size():
    tokenizer = TransformersTokenizer(vocab_size=30)
    tokenizer.fit(texts)
    vocab = tokenizer.vocab
    print(vocab)
    assert len(vocab) == 30