Example #1
 def test_validation(self):
     """
     Ensure validation settings do not result in an error
     """
     config = self.default_config(val_interval=10, val_size=10)
     model = Classifier(**config)
     train_sample = self.dataset.sample(n=20)
     model.fit(train_sample.Text, train_sample.Target)
Example #2
 def test_fit_predict_batch_size_1(self):
     """
     Ensure training is possible with batch size of 1
     """
     model = Classifier(config=self.default_config())
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
Example #3
    def test_fit_with_eval_acc(self):
        """
        Test issue #263
        """

        model = Classifier(**self.default_config(batch_size=3, eval_acc=True))
        train_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
Example #4
    def test_oversample(self):
        """
        Ensure model training does not error out when oversampling is set to True
        """

        model = Classifier(**self.default_config())
        model.config.oversample = True
        train_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)
Example #5
    def test_correct_cached_predict(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Test with different sizes to make sure we handle cases where
        # the data doesn't divide evenly into batches
        half_sample = int(self.n_sample / 2)
        quarter_sample = int(half_sample / 2)

        model.fit(train_sample.Text.values, train_sample.Target.values)

        # Predictions w/o cached predict
        preds = [
            model.predict_proba(valid_sample.Text.values[:half_sample]),
            model.predict_proba(valid_sample.Text.values[half_sample:]),
            model.predict_proba(valid_sample.Text.values[:quarter_sample]),
            model.predict_proba(valid_sample.Text.values[quarter_sample:])
        ]

        # Predictions w/ cached predict
        with model.cached_predict():
            cached_preds = [
                model.predict_proba(valid_sample.Text.values[:half_sample]),
                model.predict_proba(valid_sample.Text.values[half_sample:]),
                model.predict_proba(valid_sample.Text.values[:quarter_sample]),
                model.predict_proba(valid_sample.Text.values[quarter_sample:])
            ]

        for batch_preds, batch_cached_preds in zip(preds, cached_preds):
            for pred, cached_pred in zip(batch_preds, batch_cached_preds):
                assert list(pred.keys()) == list(cached_pred.keys())
                for pred_val, cached_pred_val in zip(pred.values(), cached_pred.values()):
                    np.testing.assert_almost_equal(pred_val, cached_pred_val, decimal=4)
Example #6
 def test_language_model(self):
     """
     Ensure language model text generation does not cause errors
     Ensure generated text is a string and contains the prompt
     """
     model = Classifier(verbose=False)
     lm_out = model.generate_text("", max_length=5)
     self.assertEqual(type(lm_out), str)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())
Example #7
    @classmethod
    def setUpClass(cls):
        cls._download_data()
        
        #dataset preparation
        cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)

        path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
        with open(path, 'rt') as fp:
            cls.texts, cls.labels = json.load(fp)

        cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
        cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
        
        #train and save sequence labeler for later use
        try:
            cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
        except FileNotFoundError:
            cls.s = SequenceLabeler(**cls.default_seq_config(cls))
            cls.s.fit(cls.texts * 10, cls.labels * 10)
            cls.s.save(cls.sequence_labeler_path)
        
        #train and save classifier for later use
        train_sample = cls.classifier_dataset.sample(n=cls.n_sample*10)
        try:
            cls.cl = Classifier.load(cls.classifier_path)
        except FileNotFoundError:
            cls.cl = Classifier(**cls.default_config(cls))
            cls.cl.fit(train_sample.Text, train_sample.Target)
            cls.cl.save(cls.classifier_path)

        if cls.do_comparison:
            #train and save comparison regressor for use
            cls.cr = ComparisonRegressor()
    
            n_per = 150
            similar = []
            different = []
            for dataset in [cls.animals, cls.numbers]:
                for i in range(n_per // 2):
                    similar.append([random.choice(dataset), random.choice(dataset)])
            for i in range(n_per):
                different.append([random.choice(cls.animals), random.choice(cls.numbers)])

            targets = np.asarray([1] * len(similar) + [0] * len(different))
            data = similar + different

            cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(data, targets, test_size=0.3, random_state=42)
            
            try:
                cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
            except FileNotFoundError:
                cls.cr = ComparisonRegressor(**cls.default_config(cls))
                cls.cr.fit(cls.x_tr, cls.t_tr)
                cls.cr.save(cls.comparison_regressor_path)
Example #8
    def test_early_termination_lm(self):
        model = Classifier(verbose=False)

        # A dirty mock to make all model inferences output a hundred _classify_ tokens
        def load_mock(*args, **kwargs):
            model.sess = MagicMock()
            model.sess.run = MagicMock(return_value=100 *
                                       [model.encoder['_classify_']])

        model.saver.initialize = load_mock
        lm_out = model.generate_text()
        self.assertEqual(lm_out, '_start__classify_')
Example #9
 def test_bert_featurize(self):
     model = Classifier(base_model=BERT)
     np.testing.assert_allclose(
         model.featurize(self.TEST_DATA)[0], 
         np.load(
             os.path.join(
                 DIRECTORY, 
                 'data/test-bert-activations.npy'
             )
         ),
         atol=1e-1
     )
Example #10
 def test_roberta_featurize(self):
     model = Classifier(base_model=RoBERTa)
     np.testing.assert_allclose(
         model.featurize_sequence(self.TEST_DATA)[:,:6,:], 
         np.load(
             os.path.join(
                 DIRECTORY, 
                 'data/test-roberta-activations.npy'
             )
         ),
         atol=1e-1
     )
Example #11
 def test_auxiliary_classifier(self):
     """
     Ensure model training does not error out
     Ensure model returns predictions
     """
     (trainX, testX, trainY, _) = self.dataset
     trainY = [
         random.randint(0, 1) for _ in range(len(trainY))
     ]  # random labels, just to make sure training does not error; reasonable-prediction tests live in the sequence labeling suite
     model = Classifier(**self.default_config())
     model.fit(trainX, trainY)
     _ = model.predict(testX)
Example #12
 def test_language_model(self):
     """
     Ensure language model text generation does not cause errors
     Ensure generated text is a string and contains the prompt
     """
     model = Classifier()
     lm_out = model.generate_text("", max_length=5)
     self.assertEqual(type(lm_out), str)
     lm_out_2 = model.generate_text("Indico RULE").lower()
     self.assertEqual(type(lm_out_2), str)
     start_id = model.input_pipeline.text_encoder.start
     start_token = model.input_pipeline.text_encoder.decoder[start_id]
     self.assertIn('{}Indico RULE'.format(start_token).lower(), lm_out_2.lower())
Example #13
 def test_reasonable_predictions(self):
     """
     Ensure model converges to a reasonable solution for a trivial problem
     """
     model = Classifier(config=self.default_config())
     n_per_class = (self.n_sample * 5)
     trX = ['cat'] * n_per_class + ['finance'] * n_per_class
     trY = copy(trX)
     teX = ['feline'] * n_per_class + ['investment'] * n_per_class
     teY = ['cat'] * n_per_class + ['finance'] * n_per_class
     model.fit(trX, trY)
     predY = model.predict(teX)
     self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #14
    def test_early_termination_lm(self):
        model = Classifier(verbose=False)

        # A dirty mock to make all model inferences output a hundred _classify_ tokens
        fake_estimator = MagicMock()
        model.get_estimator = lambda *args, **kwargs: fake_estimator
        fake_estimator.predict = MagicMock(
            return_value=iter([{
                "GEN_TEXT": 100 * [ENCODER['_classify_']]
            }]))

        lm_out = model.generate_text()
        self.assertEqual(lm_out, '_start__classify_')
Example #15
 def test_reasonable_predictions_smaller_model(self):
     """
     Ensure model converges to a reasonable solution for a trivial problem
     """
     model = Classifier(base_model=GPTModelSmall)
     n_per_class = (self.n_sample * 5)
     trX = ['cat'] * n_per_class + ['finance'] * n_per_class
     np.random.shuffle(trX)
     trY = copy(trX)
     teX = ['feline'] * n_per_class + ['investment'] * n_per_class
     teY = ['cat'] * n_per_class + ['finance'] * n_per_class
     model.fit(trX, trY)
     predY = model.predict(teX)
     self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #16
async def classifyOpen311Complaint(request):
    global model

    # Check if data provided
    if request.json is None:
        return json({"result": "No data in request"})

    # Check if we have a 311 'description' field
    if request.json.get('description') is None and request.json.get('descriptions') is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    if request.json.get('descriptions') is not None:
        processedComplaints = [preProcess(d) for d in request.json.get('descriptions')]
        prediction = model.predict(processedComplaints).tolist()
        # No single top probability is computed for batch requests
        prediction_value = None
    else:
        print("Doing simple prediction")
        #prediction = model.predict([preProcess(request.json.get('description'))])[0]
        prediction_proba = model.predict_proba(
            [preProcess(request.json.get('description'))])[0]
        print("Probabilities: ", prediction_proba)
        prediction = max(prediction_proba, key=prediction_proba.get)
        # has to be a string otherwise sanic crashes
        prediction_value = str(prediction_proba[prediction])
        print("Top probability: %s, at %s" % (prediction, prediction_value))

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an Open311 message,
    # so we update the service_code and return the full message.  Otherwise we just send
    # back a new message with the service_code only
    if request.json.get('service_code') is None:
        print("No service code provided, returning one")
        return json({
            'service_code': prediction,
            'service_code_proba': prediction_value
        })
    else:
        print("Service_code was provided so updating it")
        request.json['service_code'] = prediction
        request.json['service_code_proba'] = prediction_value
        print(request.json)
        return json(request.json)
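A minimal sketch of wiring the handler above into a Sanic app; the app name, route path, and port below are assumptions rather than details from the original.

from sanic import Sanic

app = Sanic("open311-classifier")
model = None  # lazily loaded by the handler on the first request

app.add_route(classifyOpen311Complaint, "/classify", methods=["POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)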
Example #17
    def test_generate_text_stop_early(self):
        model = Classifier()

        # A dirty mock to make all model inferences output a hundred _classify_ tokens
        fake_estimator = MagicMock()
        model.get_estimator = lambda *args, **kwargs: (fake_estimator, [])
        model.input_pipeline.text_encoder._lazy_init()
        fake_estimator.predict = MagicMock(return_value=iter([{
            PredictMode.GENERATE_TEXT:
            100 * [model.input_pipeline.text_encoder["_classify_"]]
        }]))
        start_id = model.input_pipeline.text_encoder.start
        start_token = model.input_pipeline.text_encoder.decoder[start_id]
        lm_out = model.generate_text(use_extra_toks=True)
        self.assertEqual(lm_out, "{}_classify_".format(start_token))
Example #18
def generate_GPT_feats(model_path, post_level=True):
    if post_level:
        df = pd.read_csv(PROCESSED_PATH / 'all_posts_data.csv')
        df = df[df.predict_me |
                (df.label.notnull())].loc[:, ['post_id', 'cleaned_body']]
    else:
        df = pd.read_csv(PROCESSED_PATH / 'sentences.csv')
        df = df.rename(columns={'body': 'cleaned_body'})

    model = Classifier.load(model_path)
    texts_to_featurize = list(df.cleaned_body.astype(str))
    features = model.featurize(texts_to_featurize)

    # generate a df with features as cols, with index as post_id
    GPT_embeddings = pd.DataFrame(features)
    GPT_embeddings.index = df.post_id

    if post_level:
        GPT_embeddings = GPT_embeddings.add_prefix('post_lvl-')
    else:
        GPT_embeddings = GPT_embeddings.add_prefix('sentence_lvl-')
        GPT_embeddings = flatten_cols(
            GPT_embeddings.groupby('post_id').agg(['mean', 'max', 'min']))

    return GPT_embeddings
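A minimal usage sketch for generate_GPT_feats above; the model path is an assumption, and PROCESSED_PATH / flatten_cols are assumed to be defined elsewhere in the same module.

# Hypothetical path to a saved finetune Classifier
post_feats = generate_GPT_feats("models/gpt_classifier", post_level=True)
sentence_feats = generate_GPT_feats("models/gpt_classifier", post_level=False)

# Both frames are indexed by post_id, so they can be joined into one feature table
all_feats = post_feats.join(sentence_feats, how="inner")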
Example #19
class FinetuneClfBaselineNonRationalized(ClassificationExperiment):
    param_grid = {}
    
    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(auto_resample=False, *args, **kwargs)
        self.model = Classifier(val_size=0)
        
    def fit(self, X, y):
        self.model.fit(*self.resample(X, [yi[1] for yi in y]))
        
    def predict(self, X, **kwargs):
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model
Example #20
    def test_reasonable_predictions(self):
        """
        Ensure model converges to a reasonable solution for a trivial problem
        """
        model = Classifier(**self.default_config(n_epochs=5))

        n_duplicates = 5

        trX = (
            ["cat", "kitten", "feline", "meow", "kitty"] * n_duplicates +
            ["finance", "investment", "investing", "dividends", "financial"] *
            n_duplicates)
        trY = (['cat'] * (len(trX) // 2) + ['finance'] * (len(trX) // 2))
        teX = ["furball", "fiduciary"]
        teY = ["cat"] + ["finance"]
        model.fit(trX, trY)
        predY = model.predict(teX)
        print(predY)
        self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #21
 def test_save_load_language_model(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(verbose=False)
     train_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     lm_out = model.generate_text("", 5)
     self.assertEqual(type(lm_out), str)
     model.save(save_file)
     model = Classifier.load(save_file)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())
Example #22
 def test_save_load(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(config=self.default_config())
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     predictions = model.predict(valid_sample.Text)
     model.save(save_file)
     model = Classifier.load(save_file)
     new_predictions = model.predict(valid_sample.Text)
     for i, prediction in enumerate(predictions):
         self.assertEqual(prediction, new_predictions[i])
Example #23
 def test_correct_cached_predict(self):
     model = Classifier(**self.default_config())
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text.values, train_sample.Target.values)
     predictions = model.predict_proba(valid_sample.Text[:1].values)
     predictions2 = model.predict_proba(valid_sample.Text[1:2].values)
     with model.cached_predict():
         np.testing.assert_allclose(
             list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
             list(predictions[0].values()),
             rtol=1e-4)
         np.testing.assert_allclose(
             list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
             list(predictions2[0].values()),
             rtol=1e-4)
Example #24
def get_bert_model(batch_size, maxlen, dsize, save_path):
    model = Classifier(base_model=BERT,
                       batch_size=batch_size,
                       n_epochs=2,
                       max_length=maxlen,
                       lr_schedule='warmup_linear',
                       dataset_size=dsize,
                       val_size=0.1,
                       autosave_path=save_path,
                       class_weights='sqrt')

    return model
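A minimal usage sketch for get_bert_model above; the data loader and hyperparameter values are assumptions, not taken from the original.

texts, labels = load_training_data()  # hypothetical helper returning parallel lists
model = get_bert_model(batch_size=8, maxlen=256,
                       dsize=len(texts), save_path="bert-classifier")
model.fit(texts, labels)
predictions = model.predict(texts[:10])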
Example #25
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        model.fit(self.trainX, self.trainY, context=self.train_context)
        predictions = model.predict(self.trainX, context=self.train_context)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(self.trainX,
                                        context=self.train_context)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #26
 def test_save_load_language_model(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier()
     train_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     lm_out = model.generate_text("", 5)
     self.assertEqual(type(lm_out), str)
     model.save(save_file)
     model = Classifier.load(save_file)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     start_id = model.input_pipeline.text_encoder.start
     start_token = model.input_pipeline.text_encoder.decoder[start_id]
     self.assertIn('{}Indico RULE'.format(start_token).lower(), lm_out_2.lower())
Example #27
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        (trainX, testX, trainY, _) = self.dataset
        trainY = [random.randint(0, 1) for _ in range(len(trainY))]
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(testX)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #28
    def test_cached_predict(self):
        """
        Ensure second call to predict is faster than first
        """

        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)

        with model.cached_predict():
            start = time.time()
            model.predict(valid_sample.Text[:1].values)
            first = time.time()
            model.predict(valid_sample.Text[:1].values)
            second = time.time()

        first_prediction_time = first - start
        second_prediction_time = second - first
        self.assertLess(second_prediction_time, first_prediction_time / 2.0)
Example #29
 def test_classifier_auxiliary(self):
     """
     Ensure model training does not error out
     Ensure model returns predictions
     """
     model = Classifier(**self.default_config())
     model.fit(self.trainX, self.trainY, context=self.train_context)
     _ = model.predict(self.trainX, context=self.train_context)
     # test cached predict
     _ = model.predict(self.trainX, context=self.train_context)
Example #30
    def test_fit_predict(self):
        dataset = StanfordSentimentTreebank(nrows=50).dataframe
        q_dataset = QuoraDuplicate(nrows=50).dataframe

        model = MultiTask(
            tasks={
                "sst": Classifier,
                "qqp": Comparison
            },
            n_epochs=2,
            optimizer="AdamaxW",
            max_length=200,
        )

        q_X1, q_X2, q_Y = q_dataset.Text1.values, q_dataset.Text2.values, q_dataset.Target.values

        trainX, testX, trainY, testY = train_test_split(dataset.Text.values,
                                                        dataset.Target.values,
                                                        test_size=0.3,
                                                        random_state=42)

        model.fit({
            "sst": trainX,
            "qqp": list(zip(q_X1, q_X2)),
        }, {
            "sst": trainY,
            "qqp": q_Y,
        })

        model.featurize({
            "sst": testX,
            "qqp": list(zip(q_X1, q_X2))[:10],
        })

        preds = model.predict({
            "sst": testX,
            "qqp": list(zip(q_X1, q_X2))[:10],
        })
        self.assertIn("sst", preds)
        self.assertIn("qqp", preds)

        model.create_base_model("./test_base_mtl.jl", exists_ok=True)
        model = Classifier(base_model_path="./test_base_mtl.jl",
                           max_length=200)
        model.fit(trainX, trainY)

        os.remove(finetune_model_path("./test_base_mtl.jl"))
class StanfordSentimentTreebank(Dataset):

    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    def md5(self):
        return CHECKSUM
        
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(verbose=True, n_epochs=2, val_size=0.01, val_interval=10, visible_gpus=[], tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))