def test_correct_cached_predict(self):
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Test with different sizes to make sure we handle cases where
    # the data doesn't divide evenly into batches
    half_sample = self.n_sample // 2
    quarter_sample = half_sample // 2

    model.fit(train_sample.Text.values, train_sample.Target.values)

    # Predictions without cached predict
    preds = [
        model.predict_proba(valid_sample.Text.values[:half_sample]),
        model.predict_proba(valid_sample.Text.values[half_sample:]),
        model.predict_proba(valid_sample.Text.values[:quarter_sample]),
        model.predict_proba(valid_sample.Text.values[quarter_sample:]),
    ]

    # Predictions with cached predict
    with model.cached_predict():
        cached_preds = [
            model.predict_proba(valid_sample.Text.values[:half_sample]),
            model.predict_proba(valid_sample.Text.values[half_sample:]),
            model.predict_proba(valid_sample.Text.values[:quarter_sample]),
            model.predict_proba(valid_sample.Text.values[quarter_sample:]),
        ]

    # Cached and uncached probabilities should agree to ~4 decimal places
    for batch_preds, batch_cached_preds in zip(preds, cached_preds):
        for pred, cached_pred in zip(batch_preds, batch_cached_preds):
            assert list(pred.keys()) == list(cached_pred.keys())
            for pred_val, cached_pred_val in zip(pred.values(), cached_pred.values()):
                np.testing.assert_almost_equal(pred_val, cached_pred_val, decimal=4)
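# Usage sketch (hypothetical queries, not from the test suite): cached_predict
# is a context manager that keeps inference state warm across repeated calls,
# so interactive loops like the one below avoid per-call setup cost.
with model.cached_predict():
    for text in ["first query", "second query"]:
        print(model.predict_proba([text])[0])  # one {label: probability} dict per input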
def test_fit_lm_only(self):
    """
    Ensure LM-only training does not error out
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can be fit with only text
    model.fit(train_sample.Text)

    # Save and reload check
    save_file = 'tests/saved-models/test-save-load'
    model.save(save_file)
    model = Classifier.load(save_file)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        self.assertIsInstance(prediction, (int, np.integer))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
class Finetune(ClassificationExperiment):
    """
    LanguageModel finetuning as an alternative to simple models
    trained on top of pretrained features.
    """
    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(*args, **kwargs)
        self.model = Classifier(val_size=0)

    def fit(self, X, y):
        """
        :param X: `np.ndarray` of raw text sampled from training data.
        :param y: `np.ndarray` of corresponding targets sampled from training data.
        """
        self.model.fit(X, y)

    def predict(self, X, **kwargs):
        """Predict results on test set based on current internal model."""
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model
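# Usage sketch (assumed driver code, not part of the original experiment):
# `train_texts`, `train_labels`, and `test_texts` are hypothetical placeholders.
experiment = Finetune()
experiment.fit(train_texts, train_labels)
proba_frame = experiment.predict(test_texts)    # one column per class label
predicted = proba_frame.idxmax(axis=1)          # most probable class per row
experiment.cleanup()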
def test_correct_cached_predict(self):
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)

    predictions = model.predict_proba(valid_sample.Text[:1].values)
    predictions2 = model.predict_proba(valid_sample.Text[1:2].values)

    with model.cached_predict():
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
            list(predictions[0].values()),
            rtol=1e-4,
        )
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
            list(predictions2[0].values()),
            rtol=1e-4,
        )
async def classifyOpen311Complaint(request):
    global model

    # Check that data was provided
    if request.json is None:
        return json({"result": "No data in request"})

    # Check that we have a 311 'description' or 'descriptions' field
    if request.json.get('description') is None and request.json.get('descriptions') is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it
    # (Classifier.load returns the persisted model, so there is no need to
    # construct a fresh Classifier first)
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    if request.json.get('descriptions') is not None:
        # Batch prediction over a list of complaint descriptions
        processedComplaints = [preProcess(x) for x in request.json.get('descriptions')]
        prediction = model.predict(processedComplaints).tolist()
        prediction_value = None  # no single top probability for batch requests
    else:
        print("Doing simple prediction")
        prediction_proba = model.predict_proba(
            [preProcess(request.json.get('description'))])[0]
        print("Probabilities: ", prediction_proba)
        prediction = max(prediction_proba, key=prediction_proba.get)
        # Has to be a string, otherwise sanic crashes
        prediction_value = str(prediction_proba[prediction])
        print("Top probability: %s, at %s" % (prediction, prediction_value))

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an Open311
    # message, so we update the service_code and return the full message.
    # Otherwise we just send back a new message with the service_code only.
    if request.json.get('service_code') is None:
        print("No service code provided, returning one")
        return json({
            'service_code': prediction,
            'service_code_proba': prediction_value
        })
    else:
        print("service_code was provided, so updating it")
        request.json['service_code'] = prediction
        request.json['service_code_proba'] = prediction_value
        print(request.json)
        return json(request.json)
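# Minimal wiring sketch (assumed, not from the original service): registering
# the handler above with a Sanic app. The route path and port are placeholders.
from sanic import Sanic
from sanic.response import json

app = Sanic("open311_classifier")
model = None  # loaded lazily on the first request by classifyOpen311Complaint

app.add_route(classifyOpen311Complaint, "/classify", methods=["POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)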
class FinetuneClfBaselineNonRationalized(ClassificationExperiment):
    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(auto_resample=False, *args, **kwargs)
        self.model = Classifier(val_size=0)

    def fit(self, X, y):
        # Each target is a pair; train on its second element (the class label)
        self.model.fit(*self.resample(X, [yi[1] for yi in y]))

    def predict(self, X, **kwargs):
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model
def test_fit_predict(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions of the right type
    """
    model = Classifier(config=self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        self.assertIsInstance(prediction, (int, np.integer))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_fit_lm_only(self):
    """
    Ensure LM-only training does not error out
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can be fit with only text
    model.fit(train_sample.Text)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        self.assertIsInstance(prediction, (int, np.integer))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_chunk_long_sequences(self):
    test_sequence = [
        "This is a sentence to test chunk_long_sequences in classification. " * 20,
        "Another example so now there are two different classes in the test. " * 20,
    ]
    labels = ["a", "b"]

    model = Classifier()
    model.config.chunk_long_sequences = True
    model.config.max_length = 18

    model.finetune(test_sequence * 10, labels * 10)
    predictions = model.predict(test_sequence * 10)
    probas = model.predict_proba(test_sequence * 10)

    self.assertEqual(len(predictions), 20)
    self.assertEqual(len(probas[0]), 2)
    np.testing.assert_almost_equal(np.sum(list(probas[0].values())), 1, decimal=4)
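# Hedged illustration of what chunking implies for prediction merging: each long
# document is split into max_length-sized chunks, and chunk-level predictions are
# combined into one result per document. Averaging, shown here, is one plausible
# merge rule, not necessarily the one finetune actually uses; `merge_chunk_probas`
# is a hypothetical helper.
import numpy as np

def merge_chunk_probas(chunk_probas):
    """Average per-chunk {label: probability} dicts into one document-level dict."""
    classes = chunk_probas[0].keys()
    return {c: float(np.mean([p[c] for p in chunk_probas])) for c in classes}

print(merge_chunk_probas([{"a": 0.9, "b": 0.1}, {"a": 0.7, "b": 0.3}]))
# -> approximately {'a': 0.8, 'b': 0.2}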