def test_explain(self):
    """
    Ensure explain() yields one explanation per *validation* example and
    that enabling explanations does not change the model's predictions.
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    explanations = model.explain(valid_sample.Text)
    normal_predictions = model.predict(valid_sample.Text)
    explanation_preds = [e["prediction"] for e in explanations]
    # check that the process of turning on explain does not change the preds
    self.assertEqual(explanation_preds, list(normal_predictions))
    # BUG FIX: explanations are produced for valid_sample, so the count must be
    # compared to valid_sample.Text, not train_sample.Text (the original check
    # only passed because both samples happen to have n_sample rows).
    self.assertEqual(len(explanation_preds), len(valid_sample.Text))
    self.assertEqual(type(explanations[0]["token_ends"]), list)
    self.assertEqual(type(explanations[0]["token_starts"]), list)
    self.assertEqual(type(explanations[0]["explanation"]), dict)
    # Each class's explanation vector must align 1:1 with the token spans.
    self.assertEqual(
        len(explanations[0]["token_starts"]), len(explanations[0]["explanation"][0])
    )
    self.assertEqual(
        len(explanations[0]["token_ends"]), len(explanations[0]["explanation"][0])
    )
def test_reasonable_predictions(self):
    """
    Ensure model converges to a reasonable solution for a trivial problem
    """
    model = Classifier(**self.default_config(n_epochs=5))
    n_duplicates = 5
    # Two clearly-separable vocabularies: cats vs. finance.
    cat_words = ["cat", "kitten", "feline", "meow", "kitty"]
    finance_words = ["finance", "investment", "investing", "dividends", "financial"]
    trX = cat_words * n_duplicates + finance_words * n_duplicates
    half = len(trX) // 2
    trY = ["cat"] * half + ["finance"] * half
    # Held-out words from each domain that never appear in training.
    teX = ["furball", "fiduciary"]
    teY = ["cat", "finance"]
    model.fit(trX, trY)
    predY = model.predict(teX)
    print(predY)
    self.assertEqual(accuracy_score(teY, predY), 1.00)
def test_fit_predict(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions of the right type
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    # Mismatched X / y lengths must be rejected up front.
    with self.assertRaises(FinetuneError):
        model.fit(train_sample.Text, train_sample.Target[:1])
    model.fit(train_sample.Text.values, train_sample.Target.values)
    predictions = model.predict(valid_sample.Text.values)
    for prediction in predictions:
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # (int, np.integer) covers plain Python ints and every NumPy int dtype
        # (including np.int64), so the assertion is both valid and broader.
        self.assertIsInstance(prediction, (int, np.integer))
    probabilities = model.predict_proba(valid_sample.Text.values)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_cached_predict_matches_uncached(self):
    """
    Cached prediction must produce the same class probabilities as ordinary
    prediction for the same single-example inputs.

    BUG FIX: this method was previously also named
    ``test_correct_cached_predict``, identical to a later method in the same
    class; the later definition shadowed this one, so it was silently never
    collected or run. Renamed so the test executes again.
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)
    # Baseline probabilities computed without the prediction cache.
    predictions = model.predict_proba(valid_sample.Text[:1].values)
    predictions2 = model.predict_proba(valid_sample.Text[1:2].values)
    with model.cached_predict():
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
            list(predictions[0].values()),
            rtol=1e-4,
        )
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
            list(predictions2[0].values()),
            rtol=1e-4,
        )
def test_correct_cached_predict(self):
    """
    Cached predict must agree with ordinary predict, including when request
    sizes do not divide evenly into batches.
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Test with different sizes to make sure we handle cases where
    # the data doesn't divide evenly into batches
    half_sample = int(self.n_sample / 2)
    quarter_sample = int(half_sample / 2)
    texts = valid_sample.Text.values
    slices = [
        texts[:half_sample],
        texts[half_sample:],
        texts[:quarter_sample],
        texts[quarter_sample:],
    ]

    model.fit(train_sample.Text.values, train_sample.Target.values)

    # Probabilities without cached predict ...
    preds = [model.predict_proba(chunk) for chunk in slices]
    # ... and the same requests with the cache enabled.
    with model.cached_predict():
        cached_preds = [model.predict_proba(chunk) for chunk in slices]

    for batch_preds, batch_cached_preds in zip(preds, cached_preds):
        for pred, cached_pred in zip(batch_preds, batch_cached_preds):
            assert list(pred.keys()) == list(cached_pred.keys())
            for pred_val, cached_pred_val in zip(pred.values(), cached_pred.values()):
                np.testing.assert_almost_equal(pred_val, cached_pred_val, decimal=4)
def test_class_weights(self):
    """
    Up-weighting the positive class should not reduce its recall, and the
    auto-inferred ('log') weighting scheme should train without error.
    """
    train_sample = self.dataset.sample(n=self.n_sample * 3)
    valid_sample = self.dataset.sample(n=self.n_sample * 3)

    def fit_and_score(**overrides):
        # Train a fresh classifier and report recall on the positive class.
        clf = Classifier(**self.default_config(**overrides))
        clf.fit(train_sample.Text.values, train_sample.Target.values)
        preds = clf.predict(valid_sample.Text.values)
        return recall_score(valid_sample.Target.values, preds, pos_label=1)

    recall = fit_and_score()
    new_recall = fit_and_score(class_weights={1: 100})
    self.assertTrue(new_recall >= recall)

    # test auto-inferred class weights function
    model = Classifier(**self.default_config(class_weights='log'))
    model.fit(train_sample.Text.values, train_sample.Target.values)
# --- Finetune a Classifier on the resampled complaints data, then predict ---
# NOTE(review): trainX_res_list / trainY_res_list / testX are defined earlier
# in this script (not visible here) — presumably the oversampled training
# split and the held-out test texts; confirm against the full file.
print("Starting training")
start = time.time()
model = Classifier(
    max_length=512,
    val_interval=1000,
    n_epochs=3,
    l2_reg=0.0,
    lr=6.25E-05,
    lm_loss_coef=0.25,  # weight of the auxiliary language-model loss
    # eval_acc = True,  # doesn't work
    # oversample = True,  # oversamples too much, so I am doing it separately
    params_device=0,
    autosave_path="/W210_Gov_Complaints_Portal/models/",
    verbose=True,
)
model.fit(trainX_res_list, trainY_res_list)  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")
# Serialize the model to disk
model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print(testX.shape)
print(model)
# Time inference separately from training.
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
def download(self):
    """
    Download Stanford Sentiment Treebank to data directory
    """
    path = Path(self.filename)
    # Make sure the destination directory exists before downloading into it.
    path.parent.mkdir(parents=True, exist_ok=True)
    generic_download(
        url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
        text_column="Text",
        target_column="Target",
        filename=SST_FILENAME)


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(
        verbose=True,
        n_epochs=2,
        val_size=0.01,
        val_interval=10,
        visible_gpus=[],  # CPU-only run
        tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX, trainY)
    # Fraction of correct predictions on the held-out 30% split.
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
"""GPT2imdb.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_484wco-2YnrTKVJr5wN4qW4RDQIuKZD
"""

import pandas as pd
import finetune

# Actor-name -> top-genre table hosted on GitHub.
url = 'https://raw.githubusercontent.com/BillGu19/Bass/master/name_genre_identifiers.csv'
name_genre = pd.read_csv(url)
name = name_genre['primaryName']
genre = name_genre['top genre']
#print(name)
#print(genre)
#print(name_genre)

from finetune.base_models import BERT, BERTLarge, GPT2, GPT2Medium, GPT2Large, TextCNN, TCN, RoBERTa, DistilBERT
from finetune import Classifier
from finetune import LanguageModel

#X = ['german shepherd', 'maine coon', 'persian', 'beagle']
#Y = ['dog', 'cat', 'cat', 'dog']

# Finetune a GPT2-based classifier to predict an actor's top genre from name.
model = Classifier(base_model=GPT2)
model.fit(name, genre)

# Actors held out from the CSV-driven training data above.
testX = ['Tom Cruise', 'Jamie Lee Curtis', 'Claire Danes', 'Geena Davis',
         'Robert De Niro', 'John Denver', 'Johnny Depp', 'Leonardo DiCaprio',
         'Clint Eastwood']
predictions = model.predict(testX)
print(predictions)
# Where the post data lives and where finetuned models are written.
DATA_PATH = Path('./data')
MODELS_PATH = Path('./models')
MODELS_PATH.mkdir(exist_ok=True)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--nrows', default=147618, type=int,
        help='Define number of posts to be used to perform unsupervised '
             'finetuning of language model, defaults to all posts available (147618)')
    # BUG FIX: --name had no default and was not required, so model.save(
    # MODELS_PATH / None) would raise TypeError only *after* a long finetuning
    # run. Require it so the script fails fast with a clear usage message.
    parser.add_argument('--name', type=str, required=True,
                        help='Name of model to be saved in ./models directory')
    parser.add_argument('--labeled', action='store_true',
                        help='Use only labeled posts for finetuning')
    args = parser.parse_args()

    # read in data and select sample based on CLI args
    posts_df = pd.read_csv(
        DATA_PATH / 'processed' / 'all_posts_data.csv',
        usecols=['post_id', 'cleaned_body', 'label', 'predict_me'])
    if args.labeled:
        # Keep only posts that are labeled or flagged for prediction.
        posts_sample = posts_df[(posts_df.label.notnull()) | posts_df.predict_me]
    else:
        posts_sample = posts_df.sample(n=args.nrows, random_state=42)
    texts = list(posts_sample.cleaned_body.astype(str))
    print(f'{len(texts)} posts will be used to finetune the GPT language model')

    model = Classifier(batch_size=8)
    # fit() with no targets performs unsupervised language-model finetuning.
    model.fit(texts)
    model.save(MODELS_PATH / args.name)
# --- Filter descriptions by length, split, and finetune a classifier ---
# NOTE(review): `data3` is constructed earlier in this script (not visible
# here); it appears to have 'description' and 'OurLabel' columns — confirm.
print(data3.shape)
print(data3.loc[82480])

# Keep only descriptions between 20 and 512 characters (model max_length).
mask = (data3['description'].str.len() >= 20) & (data3['description'].str.len() <= 512)
dataFiltered = data3.loc[mask]
print(dataFiltered.shape)

# ourLabel doesn't have NaN values, so that is good.
dataFiltered.columns[dataFiltered.isna().any()].tolist()

trainingData = dataFiltered[["description", "OurLabel"]]
print(type(trainingData))
print(trainingData.shape)

# Split in train and test 80/20
trainX, testX, trainY, testY = train_test_split(
    trainingData.description, trainingData.OurLabel, test_size=0.2, random_state=42)
# bigMask = (trainingData["description"].str.len() >=1000)
# print(trainingData.loc[bigMask].shape)
print(trainX.shape)
print(type(trainX))
print(trainY.shape)

model = Classifier(max_length=512, val_interval=3000, verbose=True)  # Load base model
model.fit(trainX, trainY)  # Finetune base model on custom data
model.save("newModel")  # Serialize the model to disk
# --- Train on a stratified sample, save the model, and time inference ---
# NOTE(review): sampleX / sampleY come from earlier in this script (not
# visible here); presumably the stratified sample the first print refers to.
print("Prepared a stratified sample.")
# 80/20 split; stratify keeps the class distribution equal in both halves.
trainX, testX, trainY, testY = train_test_split(
    sampleX, sampleY, test_size=0.2, random_state=42, stratify=sampleY)
print(trainX.shape)
print("Split into train and test")

print("Starting training")
print(trainX.shape)
start = time.time()
model = Classifier(max_length=512, val_interval=3000, verbose=True)  # Load base model
model.fit(trainX.tolist(), trainY.tolist())  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")
model.save("combined_model_20181018")  # Serialize the model to disk
print("Model Saved")

# model = Classifier.load("../models/combined_model_20181018")
print(testX.shape)
print(model)
# Time inference separately from training.
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")
    Download Stanford Sentiment Treebank to data directory
    """
    # NOTE(review): this chunk begins mid-method — the enclosing
    # `def download(self):` line and the docstring opener are outside this
    # view, so the indentation here is reconstructed; verify against the file.
    path = Path(self.filename)
    # Ensure the destination directory exists before downloading.
    path.parent.mkdir(parents=True, exist_ok=True)
    generic_download(
        url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
        text_column="Text",
        target_column="Target",
        filename=SST_FILENAME
    )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=200).dataframe
    # Factory returning a fresh iterator of raw texts for LM pre-training.
    pre_train_generator = lambda: iter(StanfordSentimentTreebank(nrows=5000).dataframe.Text.values)
    model = Classifier(n_epochs=3, batch_size=2, lr_warmup=0.1, tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text.values, dataset.Target.values, test_size=0.3, random_state=42)
    # Unsupervised pre-training pass: dataset_size must be set explicitly
    # because a generator input has no len().
    model.config.dataset_size = 5000
    model.config.val_size = 100
    model.config.val_interval = 1000
    model.config.batch_size = 5
    model.fit(pre_train_generator)
    # Supervised fit: reset validation settings before training on labels.
    model.config.val_size = None
    model.config.val_interval = None
    #model.config.dataset_size = 1000 # This is automatically set as trainX has len
    model.config.batch_size = 2
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
class StanfordSentimentTreebank(Dataset):
    """Stanford Sentiment Treebank (binary), fetched on demand to DATA_PATH."""

    def __init__(self, filename=None, **kwargs):
        # Fall back to the module-level default location when none is given.
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    def md5(self):
        # Expected checksum of the downloaded CSV.
        return CHECKSUM

    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        destination = Path(self.filename)
        destination.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME,
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    frame = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(
        verbose=True,
        n_epochs=2,
        val_size=0.01,
        val_interval=10,
        visible_gpus=[],
        tensorboard_folder='.tensorboard',
    )
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        frame.Text, frame.Target, test_size=0.3, random_state=42
    )
    model.fit(train_texts, train_labels)
    accuracy = np.mean(model.predict(test_texts) == test_labels)
    print('Test Accuracy: {:0.2f}'.format(accuracy))