Example #1
    def test_correct_cached_predict(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Test with different sizes to make sure we handle cases where
        # the data doesn't divide evenly into batches
        half_sample = int(self.n_sample / 2)
        quarter_sample = int(half_sample / 2)

        model.fit(train_sample.Text.values, train_sample.Target.values)

        # Predictions w/o cached predict
        preds = [
            model.predict_proba(valid_sample.Text.values[:half_sample]),
            model.predict_proba(valid_sample.Text.values[half_sample:]),
            model.predict_proba(valid_sample.Text.values[:quarter_sample]),
            model.predict_proba(valid_sample.Text.values[quarter_sample:])
        ]

        # Predictions w/ cached predict
        with model.cached_predict():
            cached_preds = [
                model.predict_proba(valid_sample.Text.values[:half_sample]),
                model.predict_proba(valid_sample.Text.values[half_sample:]),
                model.predict_proba(valid_sample.Text.values[:quarter_sample]),
                model.predict_proba(valid_sample.Text.values[quarter_sample:])
            ]

        for batch_preds, batch_cached_preds in zip(preds, cached_preds):
            for pred, cached_pred in zip(batch_preds, batch_cached_preds):
                assert list(pred.keys()) == list(cached_pred.keys())
                for pred_val, cached_pred_val in zip(pred.values(), cached_pred.values()):
                    np.testing.assert_almost_equal(pred_val, cached_pred_val, decimal=4)
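The test methods in these examples are taken from a larger test class and rely on self.dataset, self.n_sample, and self.default_config() being defined elsewhere. A minimal, hypothetical harness along the following lines would make them runnable; the fixture data and config values are illustrative assumptions, not part of the original test suite.

import unittest

import numpy as np
import pandas as pd
from finetune import Classifier


class TestClassifier(unittest.TestCase):
    # Number of rows each test samples from the fixture dataset (assumed value)
    n_sample = 20

    def setUp(self):
        # Tiny two-class dataset exposing the Text / Target columns the tests index into
        self.dataset = pd.DataFrame({
            "Text": ["a clearly positive review " * 5, "a clearly negative review " * 5] * 10,
            "Target": [1, 0] * 10,
        })

    def default_config(self):
        # Keep sequences short and training brief so the tests run quickly
        return dict(max_length=64, n_epochs=1)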
Example #2
    def test_fit_lm_only(self):
        """
        Ensure LM only training does not error out
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can still be fit with only text
        model.fit(train_sample.Text)

        # Save and reload check
        save_file = 'tests/saved-models/test-save-load'
        model.save(save_file)
        model = Classifier.load(save_file)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.int64))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #3
class Finetune(ClassificationExperiment):
    """
    LanguageModel finetuning as an alternative to simple models trained on top of pretrained features.
    """

    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(*args, **kwargs)
        self.model = Classifier(val_size=0)

    def fit(self, X, y):
        """
        :param X: `np.ndarray` of raw text sampled from training data.
        :param y: `np.ndarray` of corresponding targets sampled from training data.
        """
        self.model.fit(X, y)

    def predict(self, X, **kwargs):
        """Predict results on test set based on current internal model."""
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model
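The experiment wrapper above is normally driven by an external benchmarking harness. As a rough illustration of the lifecycle it expects, the sketch below constructs the experiment, fits on raw text and labels, collects per-class probabilities, and frees the model; the sample data and the call sequence are assumptions, not taken from the original framework.

import numpy as np

# Hypothetical driver; in practice the harness supplies the data splits.
X_train = np.array(["great product", "terrible service"] * 50)
y_train = np.array(["positive", "negative"] * 50)
X_test = np.array(["really great", "awful experience"])

experiment = Finetune()
experiment.fit(X_train, y_train)
proba_df = experiment.predict(X_test)    # one column per class label
print(proba_df.idxmax(axis=1).tolist())  # most likely class for each row
experiment.cleanup()                     # release the underlying Classifier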
Example #4
    def test_correct_cached_predict(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict_proba(valid_sample.Text[:1].values)
        predictions2 = model.predict_proba(valid_sample.Text[1:2].values)
        with model.cached_predict():
            np.testing.assert_allclose(
                list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
                list(predictions[0].values()),
                rtol=1e-4,
            )
            np.testing.assert_allclose(
                list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
                list(predictions2[0].values()),
                rtol=1e-4,
            )
Example #5
async def classifyOpen311Complaint(request):
    global model

    # Check if data provided
    if request.json is None:
        return json({"result": "No data in request"})

    # Check if we have a 311 'description' field
    if request.json.get('description') is None and request.json.get(
            'descriptions') is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it
    if model is None:
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        # Note: Classifier.load returns the saved model together with its saved
        # configuration, so it replaces the instance constructed above
        model = Classifier.load("/root/combined_model_20181021")

    if request.json.get('descriptions') is not None:
        # Batch prediction over a list of complaint descriptions
        processedComplaints = [
            preProcess(x) for x in request.json.get('descriptions')
        ]
        prediction = model.predict(processedComplaints).tolist()
        # No single top probability is available for a batch prediction
        prediction_value = None
    else:
        print("Doing simple prediction")
        prediction_proba = model.predict_proba(
            [preProcess(request.json.get('description'))])[0]
        print("Probabilities: ", prediction_proba)
        prediction = max(prediction_proba, key=prediction_proba.get)
        # Has to be a string, otherwise sanic crashes
        prediction_value = str(prediction_proba[prediction])
        print("Top probability: %s at %s" % (prediction, prediction_value))

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an Open311 message,
    # so we update the service_code and return the full message.  Otherwise we just send
    # back a new message with the service_code only
    if request.json.get('service_code') is None:
        print("No service code provided, returning one")
        return json({
            'service_code': prediction,
            'service_code_proba': prediction_value
        })
    else:
        print("Service_code was provided so updating it")
        request.json['service_code'] = prediction
        request.json['service_code_proba'] = prediction_value
        print(request.json)
        return json(request.json)
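The handler above assumes a Sanic application with a module-level model global, json imported from sanic.response, and a preProcess helper defined elsewhere. A minimal sketch of that wiring is shown below; the route path, host, and port are assumptions, not taken from the original service.

from sanic import Sanic
from sanic.response import json

from finetune import Classifier

app = Sanic("open311_classifier")
model = None  # loaded lazily inside the handler on the first request

app.add_route(classifyOpen311Complaint, "/classify", methods=["POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)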
Example #6
class FinetuneClfBaselineNonRationalized(ClassificationExperiment):
    param_grid = {}
    
    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(auto_resample=False, *args, **kwargs)
        self.model = Classifier(val_size=0)
        
    def fit(self, X, y):
        # Train on the plain class labels: each target is assumed to be a
        # (rationale, label) pair, so index 1 selects the label.
        self.model.fit(*self.resample(X, [yi[1] for yi in y]))
        
    def predict(self, X, **kwargs):
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model
Example #7
    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        """

        model = Classifier(config=self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)

        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.int64))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #8
    def test_fit_lm_only(self):
        """
        Ensure LM only training does not error out
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can be fit with only text (LM-only training)
        model.fit(train_sample.Text)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)

        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.int64))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #9
    def test_chunk_long_sequences(self):
        test_sequence = [
            "This is a sentence to test chunk_long_sequences in classification. " * 20,
            "Another example so now there are two different classes in the test. " * 20,
        ]
        labels = ["a", "b"]
        model = Classifier()
        model.config.chunk_long_sequences = True
        model.config.max_length = 18

        model.finetune(test_sequence * 10, labels * 10)

        predictions = model.predict(test_sequence * 10)
        probas = model.predict_proba(test_sequence * 10)

        self.assertEqual(len(predictions), 20)
        self.assertEqual(len(probas[0]), 2)
        np.testing.assert_almost_equal(np.sum(list(probas[0].values())), 1, decimal=4)
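Example #9 enables chunking by mutating model.config after construction. Since the other examples pass settings straight to the constructor (e.g. Classifier(**self.default_config())), the same setup can presumably be expressed as constructor keyword arguments; the snippet below is a hedged sketch of that variant, not code from the original test.

# Assumes chunk_long_sequences and max_length are accepted as constructor kwargs
model = Classifier(chunk_long_sequences=True, max_length=18)
model.finetune(test_sequence * 10, labels * 10)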