def test_save(tokenizer, tmp_path):
    tokenizer.save(tmp_path)

    loaded_tokenizer = TransformersTokenizer()
    loaded_tokenizer.load(tmp_path)

    tokens = loaded_tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# TransformersTokenizer is assumed to be defined elsewhere in this repo


class TfidfTransformersSVM:
    def _init_model(self):
        self.tokenizer = TransformersTokenizer()
        self.model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                    tokenizer=self.tokenizer.tokenize,
                ),
            ),
            ("svm", OneVsRestClassifier(SVC(kernel="linear", probability=True))),
        ])

    def set_params(self, **params):
        if not hasattr(self, "model"):
            self._init_model()
        # TODO: Pass params to TransformersTokenizer
        self.model.set_params(**params)

    def fit(self, X, Y):
        if not hasattr(self, "model"):
            # _init_model sets self.model in place and returns None,
            # so its result must not be assigned back to self.model
            self._init_model()
        self.tokenizer.fit(X)
        self.model.fit(X, Y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)
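# A minimal usage sketch, not from the source: fit the TF-IDF + SVM
# pipeline on a toy multilabel dataset and predict. The texts and the
# binary label matrix Y below are hypothetical example data.
X = [
    "malaria vaccine trial in children",
    "deep learning for medical imaging",
    "malaria transmission in rural areas",
    "neural networks for image segmentation",
    "antimalarial drug resistance study",
    "convolutional models for radiology scans",
]
Y = [[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]

model = TfidfTransformersSVM()
model.fit(X, Y)
print(model.predict(X))  # predict_proba(X) returns per-label scores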
import logging
from os import path

import gensim.downloader as api
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.preprocessing.sequence import pad_sequences

# KerasTokenizer and TransformersTokenizer are assumed to be defined
# elsewhere in this repo
logger = logging.getLogger(__name__)


class KerasVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size=None, sequence_length=None,
                 oov_token="<OOV>", tokenizer_library="keras"):
        self.vocab_size = vocab_size
        self.oov_token = oov_token
        self.sequence_length = sequence_length
        self.tokenizer_library = tokenizer_library

    def _infer_from_data(self, X, load_buffer=1000):
        # We could look at a sample instead of all of X to be more efficient
        max_sequence_length = 1

        def update_max_sequence_length(X_buffer, max_sequence_length):
            X_tokens = self.tokenizer.encode(X_buffer)
            sequence_length = max(len(x) for x in X_tokens)
            if sequence_length >= max_sequence_length:
                max_sequence_length = sequence_length
            return max_sequence_length

        # Encode in buffered batches so we never hold all of X tokenized at once
        X_buffer = []
        for x in X:
            X_buffer.append(x)
            if len(X_buffer) >= load_buffer:
                max_sequence_length = update_max_sequence_length(
                    X_buffer, max_sequence_length)
                X_buffer = []
        if X_buffer:
            max_sequence_length = update_max_sequence_length(
                X_buffer, max_sequence_length)
        self.sequence_length = max_sequence_length

    def fit(self, X, *_):
        if self.tokenizer_library == "keras":
            self.tokenizer = KerasTokenizer(
                vocab_size=self.vocab_size, oov_token=self.oov_token)
        elif self.tokenizer_library == "transformers":
            if self.vocab_size is None:
                self.tokenizer = TransformersTokenizer()
            else:
                self.tokenizer = TransformersTokenizer(vocab_size=self.vocab_size)
        self.tokenizer.fit(X)
        if not self.sequence_length:
            logger.info(
                "Param sequence_length not provided. Inferring from data. "
                "This might take a while...")
            self._infer_from_data(X)
        return self

    def transform(self, X, *_):
        sequences = self.tokenizer.encode(X)
        return pad_sequences(sequences, maxlen=self.sequence_length)

    def build_embedding_matrix(self, embeddings_name_or_path=None):
        """
        Builds an embedding matrix from either a local embeddings path
        or a gensim pre-trained word vector name.

        Args:
            embeddings_name_or_path: Can be either:
                - a local path to a word embeddings file
                - the name of a gensim pre-trained word vector model,
                  e.g. 'glove-twitter-25'; for the complete list see
                  https://github.com/RaRe-Technologies/gensim-data#models

        Returns:
            An embedding matrix
        """
        local_embeddings = False
        # Guard against None before isfile, so the final else branch is reachable
        if embeddings_name_or_path and path.isfile(embeddings_name_or_path):
            try:
                embeddings_index = {}
                with open(embeddings_name_or_path) as f:
                    for line in f:
                        word, coefs = line.split(maxsplit=1)
                        coefs = np.fromstring(coefs, "f", sep=" ")
                        embeddings_index[word] = coefs
                emb_dim = len(coefs)
                local_embeddings = True
            except TypeError:
                raise TypeError("Incorrect local embeddings path")
        elif embeddings_name_or_path:
            try:
                embeddings_index = api.load(embeddings_name_or_path)
                emb_dim = embeddings_index.vector_size
            except ValueError:
                raise ValueError(
                    "Incorrect gensim word vector model name, "
                    "try e.g. 'glove-twitter-25'")
        else:
            raise TypeError("No local or gensim word embeddings given")

        # Row 0 is reserved for padding, hence the +1
        num_words = len(self.tokenizer.vocab) + 1
        embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.tokenizer.vocab.items():
            if local_embeddings:
                embedding_vector = embeddings_index.get(word)
            else:
                # get_vector raises KeyError if the word isn't in the vocab
                try:
                    embedding_vector = embeddings_index.get_vector(word)
                except KeyError:
                    embedding_vector = None
            if embedding_vector is not None:
                # words not found in the embedding index stay all-zeros
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
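# A minimal usage sketch, not from the source: vectorize a toy corpus
# into padded integer sequences, then build an embedding matrix from a
# small gensim model. The texts and model name are example values, and
# downloading 'glove-twitter-25' requires a network connection.
texts = ["this is a test", "this is another, slightly longer test"]

vectorizer = KerasVectorizer(tokenizer_library="keras")
X = vectorizer.fit_transform(texts)
print(X.shape)  # (2, inferred sequence_length)

embedding_matrix = vectorizer.build_embedding_matrix("glove-twitter-25")
print(embedding_matrix.shape)  # (vocab size + 1, 25)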
def test_lowercase():
    tokenizer = TransformersTokenizer(lowercase=False)
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert tokens[0] == "This"
def test_bpe_model():
    tokenizer = TransformersTokenizer(model="bpe")
    tokenizer.fit(texts)
    tokens = tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
import pytest


@pytest.fixture
def tokenizer():
    tokenizer = TransformersTokenizer()
    tokenizer.fit(texts)
    return tokenizer
def test_vocab_size():
    tokenizer = TransformersTokenizer(vocab_size=30)
    tokenizer.fit(texts)

    vocab = tokenizer.vocab
    assert len(vocab) == 30