def test_validation(self):
    """
    Ensure validation settings do not result in an error
    """
    config = self.default_config(val_interval=10, val_size=10)
    model = Classifier(**config)
    train_sample = self.dataset.sample(n=20)
    model.fit(train_sample.Text, train_sample.Target)

def test_fit_predict_batch_size_1(self):
    """
    Ensure training is possible with batch size of 1
    """
    model = Classifier(config=self.default_config(batch_size=1))
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    model.predict(valid_sample.Text)

def test_fit_with_eval_acc(self):
    """
    Test issue #263
    """
    model = Classifier(**self.default_config(batch_size=3, eval_acc=True))
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

def test_oversample(self):
    """
    Ensure model training does not error out when oversampling is set to True
    """
    model = Classifier(**self.default_config())
    model.config.oversample = True
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)

def test_correct_cached_predict(self):
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Test with different sizes to make sure we handle cases where
    # the data doesn't divide evenly into batches
    half_sample = self.n_sample // 2
    quarter_sample = half_sample // 2

    model.fit(train_sample.Text.values, train_sample.Target.values)

    # Predictions without cached predict
    preds = [
        model.predict_proba(valid_sample.Text.values[:half_sample]),
        model.predict_proba(valid_sample.Text.values[half_sample:]),
        model.predict_proba(valid_sample.Text.values[:quarter_sample]),
        model.predict_proba(valid_sample.Text.values[quarter_sample:]),
    ]

    # Predictions with cached predict
    with model.cached_predict():
        cached_preds = [
            model.predict_proba(valid_sample.Text.values[:half_sample]),
            model.predict_proba(valid_sample.Text.values[half_sample:]),
            model.predict_proba(valid_sample.Text.values[:quarter_sample]),
            model.predict_proba(valid_sample.Text.values[quarter_sample:]),
        ]

    for batch_preds, batch_cached_preds in zip(preds, cached_preds):
        for pred, cached_pred in zip(batch_preds, batch_cached_preds):
            assert list(pred.keys()) == list(cached_pred.keys())
            for pred_val, cached_pred_val in zip(pred.values(), cached_pred.values()):
                np.testing.assert_almost_equal(pred_val, cached_pred_val, decimal=4)

def test_language_model(self):
    """
    Ensure the language model produces string output
    Ensure generated text contains the seed text
    """
    model = Classifier(verbose=False)
    lm_out = model.generate_text("", max_length=5)
    self.assertEqual(type(lm_out), str)
    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)
    self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())

def setUpClass(cls):
    cls._download_data()

    # Dataset preparation
    cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)
    path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
    with open(path, 'rt') as fp:
        cls.texts, cls.labels = json.load(fp)
    cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken",
                   "guinea pig", "donkey", "turkey", "duck", "camel", "goose",
                   "llama", "rabbit", "fox"]
    cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight",
                   "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
                   "fifteen", "sixteen"]

    # Train and save a sequence labeler for later use
    try:
        cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
    except FileNotFoundError:
        cls.s = SequenceLabeler(**cls.default_seq_config(cls))
        cls.s.fit(cls.texts * 10, cls.labels * 10)
        cls.s.save(cls.sequence_labeler_path)

    # Train and save a classifier for later use
    train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
    try:
        cls.cl = Classifier.load(cls.classifier_path)
    except FileNotFoundError:
        cls.cl = Classifier(**cls.default_config(cls))
        cls.cl.fit(train_sample.Text, train_sample.Target)
        cls.cl.save(cls.classifier_path)

    if cls.do_comparison:
        # Train and save a comparison regressor for later use
        cls.cr = ComparisonRegressor()

        n_per = 150
        similar = []
        different = []
        for dataset in [cls.animals, cls.numbers]:
            for i in range(n_per // 2):
                similar.append([random.choice(dataset), random.choice(dataset)])
        for i in range(n_per):
            different.append([random.choice(cls.animals), random.choice(cls.numbers)])

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(
            data, targets, test_size=0.3, random_state=42)

        try:
            cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)

def test_early_termination_lm(self):
    model = Classifier(verbose=False)

    # A dirty mock to make all model inferences output a hundred _classify_ tokens
    def load_mock(*args, **kwargs):
        model.sess = MagicMock()
        model.sess.run = MagicMock(return_value=100 * [model.encoder['_classify_']])

    model.saver.initialize = load_mock
    lm_out = model.generate_text()
    self.assertEqual(lm_out, '_start__classify_')

def test_bert_featurize(self):
    model = Classifier(base_model=BERT)
    np.testing.assert_allclose(
        model.featurize(self.TEST_DATA)[0],
        np.load(os.path.join(DIRECTORY, 'data/test-bert-activations.npy')),
        atol=1e-1
    )

def test_roberta_featurize(self):
    model = Classifier(base_model=RoBERTa)
    np.testing.assert_allclose(
        model.featurize_sequence(self.TEST_DATA)[:, :6, :],
        np.load(os.path.join(DIRECTORY, 'data/test-roberta-activations.npy')),
        atol=1e-1
    )

def test_auxiliary_classifier(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions
    """
    (trainX, testX, trainY, _) = self.dataset
    # Random labels, just to make sure there are no errors;
    # reasonable-predictions tests live in the sequence labeling suite
    trainY = [random.randint(0, 1) for _ in range(len(trainY))]
    model = Classifier(**self.default_config())
    model.fit(trainX, trainY)
    _ = model.predict(testX)

def test_language_model(self):
    """
    Ensure the language model produces string output
    Ensure generated text contains the seed text
    """
    model = Classifier()
    lm_out = model.generate_text("", max_length=5)
    self.assertEqual(type(lm_out), str)
    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)
    start_id = model.input_pipeline.text_encoder.start
    start_token = model.input_pipeline.text_encoder.decoder[start_id]
    self.assertIn('{}Indico RULE'.format(start_token).lower(), lm_out_2.lower())

def test_reasonable_predictions(self):
    """
    Ensure model converges to a reasonable solution for a trivial problem
    """
    model = Classifier(config=self.default_config())
    n_per_class = self.n_sample * 5
    trX = ['cat'] * n_per_class + ['finance'] * n_per_class
    trY = copy(trX)
    teX = ['feline'] * n_per_class + ['investment'] * n_per_class
    teY = ['cat'] * n_per_class + ['finance'] * n_per_class
    model.fit(trX, trY)
    predY = model.predict(teX)
    self.assertEqual(accuracy_score(teY, predY), 1.00)

def test_early_termination_lm(self):
    model = Classifier(verbose=False)

    # A dirty mock to make all model inferences output a hundred _classify_ tokens
    fake_estimator = MagicMock()
    model.get_estimator = lambda *args, **kwargs: fake_estimator
    fake_estimator.predict = MagicMock(
        return_value=iter([{"GEN_TEXT": 100 * [ENCODER['_classify_']]}])
    )

    lm_out = model.generate_text()
    self.assertEqual(lm_out, '_start__classify_')

def test_reasonable_predictions_smaller_model(self):
    """
    Ensure model converges to a reasonable solution for a trivial problem
    """
    model = Classifier(base_model=GPTModelSmall)
    n_per_class = self.n_sample * 5
    trX = ['cat'] * n_per_class + ['finance'] * n_per_class
    np.random.shuffle(trX)
    trY = copy(trX)
    teX = ['feline'] * n_per_class + ['investment'] * n_per_class
    teY = ['cat'] * n_per_class + ['finance'] * n_per_class
    model.fit(trX, trY)
    predY = model.predict(teX)
    self.assertEqual(accuracy_score(teY, predY), 1.00)

async def classifyOpen311Complaint(request):
    global model

    # Check that data was provided
    if request.json is None:
        return json({"result": "No data in request"})

    # Check that we have a 311 'description' (or batch 'descriptions') field
    if request.json.get('description') is None and request.json.get('descriptions') is None:
        return json({'service_code': 'unknown'})

    # Lazily load the model on first request
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    prediction_value = None
    if request.json.get('descriptions') is not None:
        processedComplaints = [preProcess(x) for x in request.json.get('descriptions')]
        prediction = model.predict(processedComplaints).tolist()
    else:
        print("Doing simple prediction")
        prediction_proba = model.predict_proba([preProcess(request.json.get('description'))])[0]
        print("Probabilities: ", prediction_proba)
        prediction = max(prediction_proba, key=prediction_proba.get)
        # Has to be a string, otherwise sanic crashes
        prediction_value = str(prediction_proba[prediction])
        print("Top probability: %s, at %s" % (prediction, prediction_value))

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an Open311 message,
    # so we update the service_code and return the full message. Otherwise we just send
    # back a new message with the service_code only.
    if request.json.get('service_code') is None:
        print("No service code provided, returning one")
        return json({
            'service_code': prediction,
            'service_code_proba': prediction_value
        })
    else:
        print("Service_code was provided so updating it")
        request.json['service_code'] = prediction
        request.json['service_code_proba'] = prediction_value
        print(request.json)
        return json(request.json)

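# A minimal client-side sketch for exercising the handler above. The route
# ("/classify") and port are assumptions for illustration, not part of the
# handler itself, which only defines the request/response contract.
import requests

resp = requests.post(
    "http://localhost:8000/classify",
    json={"description": "There is a large pothole blocking the bike lane"},
)
# Expected shape when no service_code was supplied:
# {"service_code": "<predicted code>", "service_code_proba": "<probability as string>"}
print(resp.json())
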
def test_generate_text_stop_early(self):
    model = Classifier()

    # A dirty mock to make all model inferences output a hundred _classify_ tokens
    fake_estimator = MagicMock()
    model.get_estimator = lambda *args, **kwargs: (fake_estimator, [])
    model.input_pipeline.text_encoder._lazy_init()
    fake_estimator.predict = MagicMock(return_value=iter([{
        PredictMode.GENERATE_TEXT: 100 * [model.input_pipeline.text_encoder["_classify_"]]
    }]))

    start_id = model.input_pipeline.text_encoder.start
    start_token = model.input_pipeline.text_encoder.decoder[start_id]
    lm_out = model.generate_text(use_extra_toks=True)
    self.assertEqual(lm_out, "{}_classify_".format(start_token))

def generate_GPT_feats(model_path, post_level=True):
    if post_level:
        df = pd.read_csv(PROCESSED_PATH / 'all_posts_data.csv')
        df = df[df.predict_me | (df.label.notnull())].loc[:, ['post_id', 'cleaned_body']]
    else:
        df = pd.read_csv(PROCESSED_PATH / 'sentences.csv')
        df = df.rename(columns={'body': 'cleaned_body'})

    model = Classifier.load(model_path)
    texts_to_featurize = list(df.cleaned_body.astype(str))
    features = model.featurize(texts_to_featurize)

    # Generate a df with features as cols, with index as post_id
    GPT_embeddings = pd.DataFrame(features)
    GPT_embeddings.index = df.post_id
    if post_level:
        GPT_embeddings = GPT_embeddings.add_prefix('post_lvl-')
    else:
        GPT_embeddings = GPT_embeddings.add_prefix('sentence_lvl-')
        # Sentence-level rows share a post_id, so aggregate them up to one row per post
        GPT_embeddings = flatten_cols(
            GPT_embeddings.groupby('post_id').agg(['mean', 'max', 'min']))
    return GPT_embeddings

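# Usage sketch for generate_GPT_feats. The model path and output filename are
# placeholders, not taken from the source; they assume a Classifier checkpoint
# saved earlier with model.save().
post_feats = generate_GPT_feats("models/gpt_classifier", post_level=True)
sentence_feats = generate_GPT_feats("models/gpt_classifier", post_level=False)

# Both frames are indexed by post_id, so they can be joined into one feature table
all_feats = post_feats.join(sentence_feats, how="inner")
all_feats.to_csv(PROCESSED_PATH / "gpt_features.csv")
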
class FinetuneClfBaselineNonRationalized(ClassificationExperiment):
    param_grid = {}

    def __init__(self, *args, **kwargs):
        """Initialize internal classifier."""
        super().__init__(auto_resample=False, *args, **kwargs)
        self.model = Classifier(val_size=0)

    def fit(self, X, y):
        self.model.fit(*self.resample(X, [yi[1] for yi in y]))

    def predict(self, X, **kwargs):
        preds = self.model.predict_proba(X)
        return pd.DataFrame.from_records(preds)

    def cleanup(self):
        del self.model

def test_reasonable_predictions(self):
    """
    Ensure model converges to a reasonable solution for a trivial problem
    """
    model = Classifier(**self.default_config(n_epochs=5))
    n_duplicates = 5
    trX = (
        ["cat", "kitten", "feline", "meow", "kitty"] * n_duplicates
        + ["finance", "investment", "investing", "dividends", "financial"] * n_duplicates
    )
    trY = ['cat'] * (len(trX) // 2) + ['finance'] * (len(trX) // 2)
    teX = ["furball", "fiduciary"]
    teY = ["cat", "finance"]
    model.fit(trX, trY)
    predY = model.predict(teX)
    self.assertEqual(accuracy_score(teY, predY), 1.00)

def test_save_load_language_model(self):
    """
    Ensure saving + loading does not cause errors
    Ensure saving + loading does not change predictions
    """
    save_file = 'tests/saved-models/test-save-load'
    model = Classifier(verbose=False)
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    lm_out = model.generate_text("", 5)
    self.assertEqual(type(lm_out), str)
    model.save(save_file)
    model = Classifier.load(save_file)
    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)
    self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())

def test_save_load(self):
    """
    Ensure saving + loading does not cause errors
    Ensure saving + loading does not change predictions
    """
    save_file = 'tests/saved-models/test-save-load'
    model = Classifier(config=self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    predictions = model.predict(valid_sample.Text)
    model.save(save_file)
    model = Classifier.load(save_file)
    new_predictions = model.predict(valid_sample.Text)
    for i, prediction in enumerate(predictions):
        self.assertEqual(prediction, new_predictions[i])

def test_correct_cached_predict(self):
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)
    predictions = model.predict_proba(valid_sample.Text[:1].values)
    predictions2 = model.predict_proba(valid_sample.Text[1:2].values)
    with model.cached_predict():
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[:1].values)[0].values()),
            list(predictions[0].values()),
            rtol=1e-4
        )
        np.testing.assert_allclose(
            list(model.predict_proba(valid_sample.Text[1:2].values)[0].values()),
            list(predictions2[0].values()),
            rtol=1e-4
        )

def get_bert_model(batch_size, maxlen, dsize, save_path):
    model = Classifier(
        base_model=BERT,
        batch_size=batch_size,
        n_epochs=2,
        max_length=maxlen,
        lr_schedule='warmup_linear',
        dataset_size=dsize,
        val_size=0.1,
        autosave_path=save_path,
        class_weights='sqrt'
    )
    return model

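# Usage sketch for get_bert_model. The tiny training lists and save path below
# are placeholders, not from the source; real data would be much larger.
# dataset_size is passed so the warmup_linear schedule can compute total
# training steps up front.
trainX = ["the service was excellent", "my issue was never resolved"]
trainY = ["positive", "negative"]

model = get_bert_model(batch_size=2, maxlen=128, dsize=len(trainX),
                       save_path="models/bert_clf")
model.fit(trainX, trainY)
print(model.predict(["quick and friendly support"]))
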
def test_save_load(self):
    """
    Ensure saving + loading does not cause errors
    Ensure saving + loading does not change predictions
    """
    save_file = "tests/saved-models/test-save-load"
    config = self.default_config(save_adam_vars=False, n_epochs=1)
    model = Classifier(**config)
    model.fit(self.trainX, self.trainY, context=self.train_context)
    predictions = model.predict(self.trainX, context=self.train_context)
    model.save(save_file)
    model = Classifier.load(save_file)
    new_predictions = model.predict(self.trainX, context=self.train_context)
    for i, prediction in enumerate(predictions):
        self.assertEqual(prediction, new_predictions[i])

def test_save_load_language_model(self):
    """
    Ensure saving + loading does not cause errors
    Ensure saving + loading does not change predictions
    """
    save_file = 'tests/saved-models/test-save-load'
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    lm_out = model.generate_text("", 5)
    self.assertEqual(type(lm_out), str)
    model.save(save_file)
    model = Classifier.load(save_file)
    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)
    start_id = model.input_pipeline.text_encoder.start
    start_token = model.input_pipeline.text_encoder.decoder[start_id]
    self.assertIn('{}Indico RULE'.format(start_token).lower(), lm_out_2.lower())

def test_save_load(self):
    """
    Ensure saving + loading does not cause errors
    Ensure saving + loading does not change predictions
    """
    save_file = "tests/saved-models/test-save-load"
    config = self.default_config(save_adam_vars=False, n_epochs=1)
    model = Classifier(**config)
    (trainX, testX, trainY, _) = self.dataset
    trainY = [random.randint(0, 1) for _ in range(len(trainY))]
    model.fit(trainX, trainY)
    predictions = model.predict(testX)
    model.save(save_file)
    model = Classifier.load(save_file)
    new_predictions = model.predict(testX)
    for i, prediction in enumerate(predictions):
        self.assertEqual(prediction, new_predictions[i])

def test_cached_predict(self):
    """
    Ensure second call to predict is faster than first
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)

    with model.cached_predict():
        start = time.time()
        model.predict(valid_sample.Text[:1].values)
        first = time.time()
        model.predict(valid_sample.Text[:1].values)
        second = time.time()

    first_prediction_time = first - start
    second_prediction_time = second - first
    self.assertLess(second_prediction_time, first_prediction_time / 2.0)

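# The timing assertion above relies on cached_predict() building the inference
# graph once and reusing it: the first predict() inside the context pays the
# one-time setup cost, and subsequent calls do not. A minimal sketch of the
# same pattern outside a test, assuming a fitted `model`:
with model.cached_predict():
    model.predict(["warm-up call, slow"])   # builds and caches the estimator
    model.predict(["second call, fast"])    # reuses the cached graph
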
def test_classifier_auxiliary(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions
    """
    model = Classifier(**self.default_config())
    model.fit(self.trainX, self.trainY, context=self.train_context)
    _ = model.predict(self.trainX, context=self.train_context)
    # Test cached predict
    _ = model.predict(self.trainX, context=self.train_context)

def test_fit_predict(self):
    dataset = StanfordSentimentTreebank(nrows=50).dataframe
    q_dataset = QuoraDuplicate(nrows=50).dataframe
    model = MultiTask(
        tasks={
            "sst": Classifier,
            "qqp": Comparison,
        },
        n_epochs=2,
        optimizer="AdamaxW",
        max_length=200,
    )
    q_X1, q_X2, q_Y = q_dataset.Text1.values, q_dataset.Text2.values, q_dataset.Target.values
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text.values, dataset.Target.values, test_size=0.3, random_state=42)

    model.fit(
        {
            "sst": trainX,
            "qqp": list(zip(q_X1, q_X2)),
        },
        {
            "sst": trainY,
            "qqp": q_Y,
        },
    )
    model.featurize({
        "sst": testX,
        "qqp": list(zip(q_X1, q_X2))[:10],
    })
    preds = model.predict({
        "sst": testX,
        "qqp": list(zip(q_X1, q_X2))[:10],
    })
    self.assertIn("sst", preds)
    self.assertIn("qqp", preds)

    model.create_base_model("./test_base_mtl.jl", exists_ok=True)
    model = Classifier(base_model_path="./test_base_mtl.jl", max_length=200)
    model.fit(trainX, trainY)
    os.remove(finetune_model_path("./test_base_mtl.jl"))

class StanfordSentimentTreebank(Dataset):
    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    def md5(self):
        return CHECKSUM

    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(
        verbose=True,
        n_epochs=2,
        val_size=0.01,
        val_interval=10,
        visible_gpus=[],
        tensorboard_folder='.tensorboard'
    )
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))