Example #1
    def test_explain(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        explanations = model.explain(valid_sample.Text)
        normal_predictions = model.predict(valid_sample.Text)
        explanation_preds = [e["prediction"] for e in explanations]

        # check that the process of turning on explain does not change the preds
        self.assertEqual(explanation_preds, list(normal_predictions))
        self.assertEqual(len(explanation_preds), len(valid_sample.Text))
        self.assertEqual(type(explanations[0]["token_ends"]), list)
        self.assertEqual(type(explanations[0]["token_starts"]), list)
        self.assertEqual(type(explanations[0]["explanation"]), dict)
        self.assertEqual(
            len(explanations[0]["token_starts"]), len(explanations[0]["explanation"][0])
        )
        self.assertEqual(
            len(explanations[0]["token_ends"]), len(explanations[0]["explanation"][0])
        )
Example #2
    def test_reasonable_predictions(self):
        """
        Ensure model converges to a reasonable solution for a trivial problem
        """
        model = Classifier(**self.default_config(n_epochs=5))
        
        n_duplicates = 5

        trX = (
            ["cat", "kitten", "feline", "meow", "kitty"] * n_duplicates + 
            ["finance", "investment", "investing", "dividends", "financial"] * n_duplicates
        )
        trY = (
            ['cat'] * (len(trX) // 2) + ['finance'] * (len(trX) // 2)
        )
        teX = ["furball", "fiduciary"]
        teY = ["cat", "finance"]
        model.fit(trX, trY)
        predY = model.predict(teX)
        print(predY)
        self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #3
    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        """

        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        with self.assertRaises(FinetuneError):
            model.fit(train_sample.Text, train_sample.Target[:1])

        model.fit(train_sample.Text.values, train_sample.Target.values)

        predictions = model.predict(valid_sample.Text.values)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.integer))

        probabilities = model.predict_proba(valid_sample.Text.values)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #4
    def test_correct_cached_predict(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict_proba(valid_sample.Text[:1].values)
        predictions2 = model.predict_proba(valid_sample.Text[1:2].values)
        with model.cached_predict():
            np.testing.assert_allclose(
                list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
                list(predictions[0].values()),
                rtol=1e-4,
            )
            np.testing.assert_allclose(
                list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
                list(predictions2[0].values()),
                rtol=1e-4,
            )
Example #5
    def test_correct_cached_predict(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Test with different sizes to make sure we handle cases where
        # the data doesn't divide evenly into batches
        half_sample = int(self.n_sample / 2)
        quarter_sample = int(half_sample / 2)

        model.fit(train_sample.Text.values, train_sample.Target.values)

        # Predictions w/o cached predict
        preds = [
            model.predict_proba(valid_sample.Text.values[:half_sample]),
            model.predict_proba(valid_sample.Text.values[half_sample:]),
            model.predict_proba(valid_sample.Text.values[:quarter_sample]),
            model.predict_proba(valid_sample.Text.values[quarter_sample:]),
        ]

        # Predictions w/ cached predict
        with model.cached_predict():
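            # Predictions made inside cached_predict(), which keeps the fitted model loaded
            # between calls, should match the un-cached results computed above.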
            cached_preds = [
                model.predict_proba(valid_sample.Text.values[:half_sample]),
                model.predict_proba(valid_sample.Text.values[half_sample:]),
                model.predict_proba(valid_sample.Text.values[:quarter_sample]),
                model.predict_proba(valid_sample.Text.values[quarter_sample:]),
            ]

        for batch_preds, batch_cached_preds in zip(preds, cached_preds):
            for pred, cached_pred in zip(batch_preds, batch_cached_preds):
                assert list(pred.keys()) == list(cached_pred.keys())
                for pred_val, cached_pred_val in zip(pred.values(),
                                                     cached_pred.values()):
                    np.testing.assert_almost_equal(pred_val,
                                                   cached_pred_val,
                                                   decimal=4)
Example #6
    def test_class_weights(self):
        # testing class weights
        train_sample = self.dataset.sample(n=self.n_sample * 3)
        valid_sample = self.dataset.sample(n=self.n_sample * 3)
        model = Classifier(**self.default_config())
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict(valid_sample.Text.values)
        recall = recall_score(valid_sample.Target.values, predictions, pos_label=1)
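        # With class 1 heavily up-weighted, recall on class 1 should be at least as high as the unweighted baseline above.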
        model = Classifier(**self.default_config(class_weights={1: 100}))
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict(valid_sample.Text.values)
        new_recall = recall_score(valid_sample.Target.values, predictions, pos_label=1)
        self.assertTrue(new_recall >= recall)

        # test auto-inferred class weights function
        model = Classifier(**self.default_config(class_weights='log'))
        model.fit(train_sample.Text.values, train_sample.Target.values)
print("Starting training")
start = time.time()
model = Classifier(
    max_length=512,
    val_interval=1000,
    n_epochs=3,
    l2_reg=0.0,
    lr=6.25E-05,
    lm_loss_coef=0.25,
    # eval_acc=True,  # doesn't work
    # oversample=True,  # oversamples too much, so I am doing it separately
    params_device=0,
    autosave_path="/W210_Gov_Complaints_Portal/models/",
    verbose=True,
)
model.fit(trainX_res_list,
          trainY_res_list)  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117"
           )  # Serialize the model to disk
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
Example #8
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME)


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(verbose=True,
                       n_epochs=2,
                       val_size=0.01,
                       val_interval=10,
                       visible_gpus=[],
                       tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text,
                                                    dataset.Target,
                                                    test_size=0.3,
                                                    random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
Example #9
"""GPT2imdb.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_484wco-2YnrTKVJr5wN4qW4RDQIuKZD
"""

import pandas as pd
import finetune

url = 'https://raw.githubusercontent.com/BillGu19/Bass/master/name_genre_identifiers.csv'
name_genre = pd.read_csv(url)
name = name_genre['primaryName']
genre = name_genre['top genre']
#print(name)
#print(genre)
#print(name_genre)

from finetune.base_models import BERT, BERTLarge, GPT2, GPT2Medium, GPT2Large, TextCNN, TCN, RoBERTa, DistilBERT
from finetune import Classifier
from finetune import LanguageModel

#X = ['german shepherd', 'maine coon', 'persian', 'beagle']
#Y = ['dog', 'cat', 'cat', 'dog']
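# Fit a classifier that maps actor names to their top genre, using GPT2 as the base model.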
model = Classifier(base_model=GPT2)
model.fit(name, genre)

testX = ['Tom Cruise','Jamie Lee Curtis', 'Claire Danes', 'Geena Davis', 'Robert De Niro', 'John Denver', 'Johnny Depp', 'Leonardo DiCaprio', 'Clint Eastwood']
predictions = model.predict(testX)
print(predictions)
Example #10
DATA_PATH = Path('./data')
MODELS_PATH = Path('./models')
MODELS_PATH.mkdir(exist_ok=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--nrows', default=147618, type=int,
                        help='Define number of posts to be used to perform unsupervised finetuning of language model, defaults to all posts available (147618)')
    parser.add_argument('--name', type=str, 
                        help='Name of model to be saved in ./models directory')
    parser.add_argument('--labeled', action='store_true',
                        help='Use only labeled posts for finetuning')
    args = parser.parse_args()

    # read in data and select sample based on CLI args
    posts_df = pd.read_csv(DATA_PATH/'processed'/'all_posts_data.csv', usecols=['post_id', 'cleaned_body', 'label', 'predict_me'])

    if args.labeled:
        posts_sample = posts_df[(posts_df.label.notnull()) | posts_df.predict_me]
    else:
        posts_sample = posts_df.sample(n=args.nrows, random_state=42)     

    texts = list(posts_sample.cleaned_body.astype(str))
    print(f'{len(texts)} posts will be used to finetune the GPT language model')

    model = Classifier(batch_size=8)
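    # Calling fit() with texts only (no labels) finetunes the underlying language model on these posts.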
    model.fit(texts)

    model.save(MODELS_PATH / args.name)
Example #11
print(data3.shape)
print(data3.loc[82480])

mask = (data3['description'].str.len() >= 20) & (data3['description'].str.len() <= 512)
dataFiltered = data3.loc[mask]
print(dataFiltered.shape)

dataFiltered.columns[dataFiltered.isna().any()].tolist()
# OurLabel doesn't have NaN values, so that is good.

trainingData = dataFiltered[["description", "OurLabel"]]
print(type(trainingData))
print(trainingData.shape)
trainX, testX, trainY, testY = train_test_split(trainingData.description,
                                                trainingData.OurLabel,
                                                test_size=0.2,
                                                random_state=42)
# bigMask = (trainingData["description"].str.len() >=1000)
# print(trainingData.loc[bigMask].shape)
# Split in train and test 80/20
print(trainX.shape)
print(type(trainX))
print(trainY.shape)

model = Classifier(max_length=512, val_interval=3000,
                   verbose=True)  # Load base model
model.fit(trainX, trainY)  # Finetune base model on custom data

model.save("newModel")  # Serialize the model to disk
print("Prepared a stratified sample.")

trainX, testX, trainY, testY = train_test_split(sampleX,
                                                sampleY,
                                                test_size=0.2,
                                                random_state=42,
                                                stratify=sampleY)
print(trainX.shape)
print("Split into train and test")

print("Starting training")
print(trainX.shape)
start = time.time()
model = Classifier(max_length=512, val_interval=3000,
                   verbose=True)  # Load base model
model.fit(trainX.tolist(),
          trainY.tolist())  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save("combined_model_20181018")  # Serialize the model to disk
print("Model Saved")

# model = Classifier.load("../models/combined_model_20181018")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")
Example #13
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=200).dataframe
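    # Callable returning a fresh iterator over unlabeled SST text, fed to model.fit() below for language-model pretraining before the supervised fit.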
    pre_train_generator = lambda: iter(StanfordSentimentTreebank(nrows=5000).dataframe.Text.values)
    model = Classifier(n_epochs=3, batch_size=2, lr_warmup=0.1, tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text.values, dataset.Target.values, test_size=0.3, random_state=42)
    model.config.dataset_size = 5000
    model.config.val_size = 100
    model.config.val_interval = 1000
    model.config.batch_size = 5
    model.fit(pre_train_generator)
    model.config.val_size = None
    model.config.val_interval = None
    # model.config.dataset_size = 1000  # set automatically because trainX has a defined length
    model.config.batch_size = 2
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
Example #14
class StanfordSentimentTreebank(Dataset):

    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    def md5(self):
        return CHECKSUM
        
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(verbose=True, n_epochs=2, val_size=0.01, val_interval=10, visible_gpus=[], tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))