def test_train(self):
    """End-to-end check: fit a TF-IDF model on 20newsgroups, persist it,
    then verify TfIdfAug produces augmented text different from the input."""

    def _split_words(text, token_pattern=r"(?u)\b\w\w+\b"):
        # Extract runs of two-or-more word characters (sklearn's default
        # token pattern).
        return re.compile(token_pattern).findall(text)

    # Fetch the training corpus (downloads on first use).
    corpus = sklearn.datasets.fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'))
    tokenized_docs = [_split_words(doc) for doc in corpus.data]

    # Fit the TF-IDF statistics and persist them to the working directory.
    model = nmw.TfIdf()
    model.train(tokenized_docs)
    model.save('.')

    # Augmenter reads the model back from the same directory.
    aug = naw.TfIdfAug(model_path='.', tokenizer=_split_words)

    texts = [
        'The quick brown fox jumps over the lazy dog',
        'asdasd test apple dog asd asd'
    ]
    for text in texts:
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

    # Sanity check that the fixture list was non-empty.
    self.assertLess(0, len(texts))
def init_tfidf_model(model_path, force_reload=False):
    """Return the process-wide TF-IDF model, reading it from disk only once.

    Subsequent calls reuse the cached instance unless ``force_reload`` is
    true.

    NOTE(review): assumes ``TFIDF_MODEL`` is initialized (e.g. to ``None``)
    elsewhere in this module — confirm; otherwise the first call would raise
    ``NameError``.
    """
    global TFIDF_MODEL

    # Serve the cached model when one exists and no reload was requested.
    # (Keep this evaluation order: the cache is checked before the flag.)
    if TFIDF_MODEL and not force_reload:
        return TFIDF_MODEL

    model = nmws.TfIdf()
    model.read(model_path)
    TFIDF_MODEL = model
    return model
def _train_tfidf(self):
    """Train a TF-IDF model on the 20newsgroups corpus and save it to
    ``self.tfidf_model_path``.

    Downloads the corpus on first use (network I/O), tokenizes every
    document, fits the TF-IDF statistics and persists them to disk.
    """
    import sklearn.datasets
    import re
    import nlpaug.model.word_stats as nmw

    # Compile once and reuse: recompiling the pattern per document (as the
    # original tokenizer did) is wasted work over ~11k newsgroup posts.
    token_re = re.compile(r"(?u)\b\w\w+\b")

    def _tokenizer(text):
        # Runs of two-or-more word characters (sklearn's default pattern).
        return token_re.findall(text)

    # Load sample data
    train_data = sklearn.datasets.fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'))

    # Tokenize input
    train_x_tokens = [_tokenizer(x) for x in train_data.data]

    # exist_ok=True avoids the check-then-create race of the former
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(self.tfidf_model_path, exist_ok=True)

    # Train TF-IDF model
    tfidf_model = nmw.TfIdf()
    tfidf_model.train(train_x_tokens)
    tfidf_model.save(self.tfidf_model_path)
def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    """Split *text* into tokens of two-or-more word characters."""
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)


# Load sample data (downloaded on first use).
train_data = sklearn.datasets.fetch_20newsgroups(
    subset='train', remove=('headers', 'footers', 'quotes'))
train_x = train_data.data

# Tokenize every document in the corpus.
train_x_tokens = [_tokenizer(doc) for doc in train_x]

# Fit the TF-IDF statistics and persist them to the working directory.
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')

# Load a TF-IDF augmenter backed by the saved model.
aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)

texts = [
    'The quick brown fox jumps over the lazy dog',
    'asdasd test apple dog asd asd'
]

# Augment each sample, printing a separator per iteration.
for text in texts:
    augmented_text = aug.augment(text)
    print('-' * 20)