async def classifyOpen311Complaint(request):
    """Classify one or many Open311 complaint descriptions.

    Reads the JSON body of ``request``. When it carries a ``descriptions``
    list, every entry is pre-processed and classified as a batch; otherwise
    the single ``description`` string is pre-processed and classified. The
    module-level ``model`` is loaded on the first request that needs it.

    Returns:
        A ``json(...)`` response containing

        * ``{"result": "No data in request"}`` when there is no JSON body,
        * ``{"service_code": "unknown"}`` when neither ``description`` nor
          ``descriptions`` is present,
        * ``{"service_code": prediction}`` when the request carried no
          ``service_code``, or
        * the incoming message with its ``service_code`` overwritten.
    """
    global model

    payload = request.json

    # Check if data provided
    if payload is None:
        # Must be a dict: a set literal ({"result", "..."}) cannot be
        # serialized to JSON.
        return json({"result": "No data in request"})

    # Check if we have a 311 'description' field
    description = payload.get('description')
    descriptions = payload.get('descriptions')
    if description is None and descriptions is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it
    if model is None:
        # NOTE(review): the freshly constructed Classifier is immediately
        # replaced by the loaded one; presumably redundant -- confirm before
        # removing.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    if descriptions is not None:
        processedComplaints = [preProcess(text) for text in descriptions]
        prediction = model.predict(processedComplaints).tolist()
    else:
        print("Doing simple prediction")
        prediction = model.predict([preProcess(description)])[0]

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an
    # Open311 message, so we update the service_code and return the full
    # message. Otherwise we just send back a new message with the
    # service_code only.
    if payload.get('service_code') is None:
        print("No service code provided, returning one")
        return json({'service_code': prediction})

    print("Service_code was provided so updating it")
    payload['service_code'] = prediction
    return json(payload)
def test_save_load(self):
    """
    Saving + loading must not raise and must not change predictions.
    Both saved files must also stay below their size limits.
    """
    full_precision_path = "tests/saved-models/test-save-load"
    half_precision_path = "tests/saved-models/test-save-load_fp16"

    settings = self.default_config(save_adam_vars=False)
    model = Classifier(**settings)
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    expected = model.predict(valid_sample.Text)

    # File-size reduction, step one: the save configured above
    # (save_adam_vars=False).
    model.save(full_precision_path)
    self.assertLess(os.stat(full_precision_path).st_size, 500000000)

    # File-size reduction, step two: store weights as float16.
    model.saver.save_dtype = np.float16
    model.save(half_precision_path)
    self.assertLess(os.stat(half_precision_path).st_size, 260000000)

    # Predictions from the reloaded fp16 model must match the originals.
    model = Classifier.load(half_precision_path)
    actual = model.predict(valid_sample.Text)
    for position, prediction in enumerate(expected):
        self.assertEqual(prediction, actual[position])
def test_classifier_auxiliary(self):
    """
    Fitting and predicting with auxiliary context must not raise.
    """
    classifier = Classifier(**self.default_config())
    classifier.fit(self.trainX, self.trainY, context=self.train_context)

    # Predict twice: the second call is the "cached predict" check.
    for _ in range(2):
        classifier.predict(self.trainX, context=self.train_context)
def test_fit_predict_batch_size_1(self):
    """
    Training and prediction must work with a batch size of 1.
    """
    classifier = Classifier(**self.default_config())
    classifier.config.batch_size = 1

    fit_rows = self.dataset.sample(n=self.n_sample)
    predict_rows = self.dataset.sample(n=self.n_sample)

    classifier.fit(fit_rows.Text.values, fit_rows.Target.values)
    classifier.predict(predict_rows.Text.values)
def test_classifier_no_auxiliary(self):
    """
    Fitting and predicting with auxiliary info disabled must not raise.
    """
    settings = self.default_config(
        use_auxiliary_info=False,
        context_dim=None,
        val_set=(self.trainX, self.trainY),
    )
    classifier = Classifier(**settings)
    classifier.fit(self.trainX, self.trainY)

    # Predict twice: the second call is the "cached predict" check.
    for _ in range(2):
        classifier.predict(self.trainX)
def test_multiple_models_fit_predict(self):
    """
    Two independently built classifiers can each be fit and used to
    predict on the same samples without raising.
    """
    first_model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    first_model.fit(train_sample.Text.values, train_sample.Target.values)
    first_model.predict(valid_sample.Text.values)

    # The second model is created while the first one is still referenced.
    second_model = Classifier(**self.default_config())
    second_model.fit(train_sample.Text.values, train_sample.Target.values)
    second_model.predict(valid_sample.Text.values)
def post(self):
    """Handle a POST on the Open311 interface.

    Classifies the ``description`` field of the JSON body with the
    module-level ``model`` (loaded on first use).

    Returns:
        ``{'service_code': 'unknown'}`` when the body or its ``description``
        is missing; ``{'service_code': <prediction>}`` when the request had
        no ``service_code``; otherwise the incoming message with its
        ``service_code`` replaced by the prediction.
    """
    global model
    print("Received POST request on Open311 interface")

    # if the classifier has not been loaded then load it
    if model is None:
        # NOTE(review): the freshly constructed Classifier is immediately
        # replaced by the loaded one; presumably redundant -- confirm before
        # removing.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # check if the JSON description has been filled in
    some_json = request.get_json()
    print("Received JSON: ", some_json)
    # Assumes a Flask-style get_json() that can return None for a body that
    # is not JSON; treat that like a missing description instead of failing
    # on ``None.get``.
    if some_json is None or some_json.get('description') is None:
        return {'service_code': 'unknown'}

    newTextDescription = some_json.get('description')
    print("received: ", newTextDescription)
    prediction = model.predict([newTextDescription])

    # check if the input data also contained the service code and if so
    # replace it and return the original message
    if some_json.get('service_code') is None:
        # No service code so just return that
        print("No service code provided, returning one")
        return {'service_code': prediction[0]}

    print("Service_code was provided so updating it")
    some_json['service_code'] = prediction[0]
    return some_json
def test_fit_lm_only(self):
    """
    Ensure LM only training does not error out, survives a save/reload
    round trip, and that the model can afterwards be fit with targets.
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can still be fit with only text
    model.fit(train_sample.Text)

    # Save and reload check
    save_file = 'tests/saved-models/test-save-load'
    model.save(save_file)
    model = Classifier.load(save_file)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy, so name the builtin directly.
        self.assertIsInstance(prediction, (int, np.int64))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def post(self):
    """Handle a POST on the google interface.

    Classifies ``queryResult.queryText`` from the JSON body with the
    module-level ``model`` (loaded on first use).

    Returns:
        ``{'fulfillmentText': 'unknown'}`` when the body, ``queryResult`` or
        ``queryText`` is missing; otherwise
        ``{'fulfillmentText': <prediction>}``.
    """
    global model
    print("Received POST request on google interface")

    # if the classifier has not been loaded then load it
    if model is None:
        # NOTE(review): the freshly constructed Classifier is immediately
        # replaced by the loaded one; presumably redundant -- confirm before
        # removing.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # check if the JSON description has been filled in
    some_json = request.get_json()
    # Assumes a Flask-style get_json() that can return None for a body that
    # is not JSON; treat that like a missing queryResult instead of failing
    # on ``None.get``.
    if some_json is None or some_json.get('queryResult') is None:
        print("Empty message text")
        return {'fulfillmentText': 'unknown'}

    queryResult = some_json.get('queryResult')
    if queryResult.get('queryText') is None:
        print("Empty message text")
        return {'fulfillmentText': 'unknown'}

    newTextDescription = queryResult.get('queryText')
    print("received: ", newTextDescription)

    # Predict the classification of the text
    prediction = model.predict([newTextDescription])

    # Return the result
    print("returning: ", prediction[0])
    return {'fulfillmentText': prediction[0]}
def test_save_load(self):
    """
    Saving + loading must not raise.
    Saving + loading must not change predictions.
    """
    checkpoint_path = 'tests/saved-models/test-save-load'

    model = Classifier(config=self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    expected = model.predict(valid_sample.Text)

    # Round-trip through disk, then predict on the same samples again.
    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)
    actual = model.predict(valid_sample.Text)

    for position, prediction in enumerate(expected):
        self.assertEqual(prediction, actual[position])
async def processGoogleActionRequest(request):
    """Classify the query text of a request on the google interface.

    Reads ``queryResult.queryText`` from the JSON body of ``request``,
    pre-processes it and classifies it with the module-level ``model``
    (loaded on first use).

    Returns:
        A ``json(...)`` response containing

        * ``{"result": "No data in request"}`` when there is no JSON body,
        * ``{'fulfillmentText': 'unknown'}`` when ``queryResult`` or its
          ``queryText`` is missing, or
        * ``{'fulfillmentText': <prediction>}`` otherwise.
    """
    global model
    print("Received POST request on google interface")

    # Check if data provided
    some_json = request.json
    if some_json is None:
        # Must be a dict: a set literal ({"result", "..."}) cannot be
        # serialized to JSON.
        return json({"result": "No data in request"})

    queryResult = some_json.get('queryResult')
    if queryResult is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})

    newTextDescription = queryResult.get('queryText')
    if newTextDescription is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})

    print("received: ", newTextDescription)
    processedDescription = preProcess(newTextDescription)
    print("pre-processed: ", processedDescription)

    # If the model is not already loaded then load it
    if model is None:
        # NOTE(review): the freshly constructed Classifier is immediately
        # replaced by the loaded one; presumably redundant -- confirm before
        # removing.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # Predict the classification of the text
    prediction = model.predict([processedDescription])

    # Return the result
    print("returning: ", prediction[0])
    return json({'fulfillmentText': prediction[0]})
def test_class_weights(self):
    """
    Up-weighting class 1 must not lower its recall, and the 'log'
    auto-inferred class weights must fit without raising.
    """
    sample_size = self.n_sample * 3
    train_sample = self.dataset.sample(n=sample_size)
    valid_sample = self.dataset.sample(n=sample_size)

    # Baseline: no class weights.
    model = Classifier(**self.default_config())
    model.fit(train_sample.Text.values, train_sample.Target.values)
    baseline_predictions = model.predict(valid_sample.Text.values)
    recall = recall_score(
        valid_sample.Target.values, baseline_predictions, pos_label=1
    )

    # Heavily up-weight class 1 and compare recall against the baseline.
    weighted_config = self.default_config(class_weights={1: 100})
    model = Classifier(**weighted_config)
    model.fit(train_sample.Text.values, train_sample.Target.values)
    weighted_predictions = model.predict(valid_sample.Text.values)
    new_recall = recall_score(
        valid_sample.Target.values, weighted_predictions, pos_label=1
    )
    self.assertTrue(new_recall >= recall)

    # test auto-inferred class weights function
    model = Classifier(**self.default_config(class_weights='log'))
    model.fit(train_sample.Text.values, train_sample.Target.values)
def _evaluate(self, session):
    """
    Fit a fresh Classifier on the stored train data and log its accuracy.

    A Classifier is built from ``self._config_to_finetune``, optionally
    seeded with a copy of the current model's saver variables, fit on
    ``self.train_data`` and scored on both ``self.train_data`` and
    ``self.test_data``. The two accuracies are written as summary values
    under ``<self._eval_dir>/../finetuning`` at the session's global step.

    If an IOError is raised while doing so, the traceback is printed and
    both accuracies are recorded as -1.0.

    :param session: session used to read the current global step.
    """
    try:
        # Everything below runs with a new graph installed as the default.
        with tf.Graph().as_default():
            # NOTE(review): function-scope import; presumably avoids a
            # circular import with finetune -- confirm.
            from finetune import Classifier

            model = Classifier(**self._config_to_finetune)

            if self._current_finetune.saver.variables:
                # Copy the current variables, skipping any key that
                # mentions "global_step" or "Adam".
                model.saver.variables = {
                    k: v.copy()
                    for k, v in self._current_finetune.saver.variables.items()
                    if "global_step" not in k and "Adam" not in k
                }
                # NOTE(review): the source indentation was lost; this
                # assignment is assumed to sit under the same ``if`` as the
                # one above -- confirm.
                model.saver.fallback_ = {
                    k: v
                    for k, v in self._current_finetune.saver.fallback.items()
                    if "global_step" not in k
                }

            train_x, train_y = self.train_data
            model.fit(train_x, train_y)

            test_x, test_y = self.test_data
            # Accuracy = mean of element-wise prediction/label equality.
            test_accuracy = np.mean(model.predict(test_x) == test_y)
            train_accuracy = np.mean(model.predict(train_x) == train_y)
    except IOError as e:
        # Report the failure and fall back to sentinel accuracies so the
        # summary below is still written.
        traceback.print_exc(file=sys.stdout)
        test_accuracy = -1.0
        train_accuracy = -1.0

    global_step = session.run(tf.train.get_or_create_global_step())

    # Summaries go to a "finetuning" directory next to the eval directory.
    directory = os.path.join(self._eval_dir, "..", "finetuning")
    if not os.path.exists(directory):
        os.makedirs(directory)

    summary_writer = writer_cache.FileWriterCache.get(directory)
    summary_proto = summary_pb2.Summary()
    summary_proto.value.add(tag="finetuning/{}_train_accurary".format(
        self._name), simple_value=float(train_accuracy))
    summary_proto.value.add(tag="finetuning/{}_test_accurary".format(
        self._name), simple_value=float(test_accuracy))
    summary_writer.add_summary(summary_proto, global_step)
    summary_writer.flush()

    # Record the iteration at which this evaluation ran.
    self._timer.update_last_triggered_step(self._iter_count)
def test_save_load(self):
    """
    Saving + loading must not raise.
    Saving + loading must not change predictions (context supplied).
    """
    checkpoint_path = "tests/saved-models/test-save-load"

    settings = self.default_config(save_adam_vars=False, n_epochs=1)
    model = Classifier(**settings)
    model.fit(self.trainX, self.trainY, context=self.train_context)
    expected = model.predict(self.trainX, context=self.train_context)

    # Round-trip through disk, then predict on the same inputs again.
    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)
    actual = model.predict(self.trainX, context=self.train_context)

    for position, prediction in enumerate(expected):
        self.assertEqual(prediction, actual[position])
def test_save_load(self):
    """
    Saving + loading must not raise.
    Saving + loading must not change predictions (random binary labels).
    """
    checkpoint_path = "tests/saved-models/test-save-load"

    settings = self.default_config(save_adam_vars=False, n_epochs=1)
    model = Classifier(**settings)

    trainX, testX, trainY, _ = self.dataset
    # Replace the real labels with one random 0/1 label per training row.
    trainY = [random.randint(0, 1) for _ in range(len(trainY))]

    model.fit(trainX, trainY)
    expected = model.predict(testX)

    # Round-trip through disk, then predict on the same inputs again.
    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)
    actual = model.predict(testX)

    for position, prediction in enumerate(expected):
        self.assertEqual(prediction, actual[position])
def test_cached_predict(self):
    """
    Inside cached_predict(), the second predict call must take less than
    half the time of the first one.
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text.values, train_sample.Target.values)

    with model.cached_predict():
        t_start = time.time()
        model.predict(valid_sample.Text[:1].values)
        t_after_first = time.time()
        model.predict(valid_sample.Text[:1].values)
        t_after_second = time.time()

    first_duration = t_after_first - t_start
    second_duration = t_after_second - t_after_first
    self.assertLess(second_duration, first_duration / 2.0)
def test_auxiliary_classifier(self):
    """
    Fitting on random labels and predicting must not raise.
    """
    trainX, testX, trainY, _ = self.dataset

    # Random labels just to make sure there are no errors -> reasonable
    # predictions tests are in sequence_label.
    trainY = [random.randint(0, 1) for _ in range(len(trainY))]

    classifier = Classifier(**self.default_config())
    classifier.fit(trainX, trainY)
    classifier.predict(testX)
def test_reasonable_predictions(self):
    """
    The model must reach perfect accuracy on a trivial two-class problem.
    """
    model = Classifier(config=self.default_config())
    n_per_class = self.n_sample * 5

    # Training texts double as their own labels.
    trX = ['cat'] * n_per_class + ['finance'] * n_per_class
    trY = copy(trX)

    # Held-out texts are related words, labelled with the training classes.
    teX = ['feline'] * n_per_class + ['investment'] * n_per_class
    teY = ['cat'] * n_per_class + ['finance'] * n_per_class

    model.fit(trX, trY)
    predicted = model.predict(teX)
    self.assertEqual(accuracy_score(teY, predicted), 1.00)
def test_reasonable_predictions_smaller_model(self):
    """
    A classifier built on GPTModelSmall must reach perfect accuracy on a
    trivial two-class problem.
    """
    model = Classifier(base_model=GPTModelSmall)
    n_per_class = self.n_sample * 5

    # Training texts double as their own labels; shuffle before copying so
    # texts and labels stay aligned.
    trX = ['cat'] * n_per_class + ['finance'] * n_per_class
    np.random.shuffle(trX)
    trY = copy(trX)

    # Held-out texts are related words, labelled with the training classes.
    teX = ['feline'] * n_per_class + ['investment'] * n_per_class
    teY = ['cat'] * n_per_class + ['finance'] * n_per_class

    model.fit(trX, trY)
    predicted = model.predict(teX)
    self.assertEqual(accuracy_score(teY, predicted), 1.00)
def test_explain(self):
    """
    Ensure explain() returns the same predictions as predict() and that
    each explanation has the expected structure.
    """
    model = Classifier(**self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    explanations = model.explain(valid_sample.Text)
    normal_predictions = model.predict(valid_sample.Text)
    explanation_preds = [e["prediction"] for e in explanations]

    # check that the process of turning on explain does not change the preds
    self.assertEqual(explanation_preds, list(normal_predictions))

    # One explanation per explained text: compare against the validation
    # sample that was passed to explain(), not the training sample.
    self.assertEqual(len(explanation_preds), len(valid_sample.Text))

    first = explanations[0]
    self.assertIs(type(first["token_ends"]), list)
    self.assertIs(type(first["token_starts"]), list)
    self.assertIs(type(first["explanation"]), dict)

    # The entry under key 0 must line up with the token boundaries.
    self.assertEqual(len(first["token_starts"]), len(first["explanation"][0]))
    self.assertEqual(len(first["token_ends"]), len(first["explanation"][0]))
def test_chunk_long_sequences(self):
    """
    With chunk_long_sequences enabled and a short max_length, the model
    must yield one prediction per input and probabilities that sum to 1.
    """
    sentence_a = "This is a sentence to test chunk_long_sequences in classification. "
    sentence_b = "Another example so now there are two different classes in the test. "
    test_sequence = [sentence_a * 20, sentence_b * 20]
    labels = ["a", "b"]

    model = Classifier()
    model.config.chunk_long_sequences = True
    model.config.max_length = 18

    model.finetune(test_sequence * 10, labels * 10)
    predictions = model.predict(test_sequence * 10)
    probas = model.predict_proba(test_sequence * 10)

    self.assertEqual(len(predictions), 20)
    self.assertEqual(len(probas[0]), 2)
    first_total = np.sum(list(probas[0].values()))
    np.testing.assert_almost_equal(first_total, 1, decimal=4)
def test_fit_predict(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions of the right type
    """
    model = Classifier(config=self.default_config())
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy, so name the builtin directly.
        self.assertIsInstance(prediction, (int, np.int64))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_fit_lm_only(self):
    """
    Ensure a default Classifier can be fit with text + targets and returns
    integer predictions and dict probabilities.
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)

    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy, so name the builtin directly.
        self.assertIsInstance(prediction, (int, np.int64))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_reasonable_predictions(self):
    """
    The model must reach perfect accuracy on a trivial two-class problem.
    """
    model = Classifier(**self.default_config(n_epochs=5))
    n_duplicates = 5

    cat_words = ["cat", "kitten", "feline", "meow", "kitty"]
    finance_words = ["finance", "investment", "investing", "dividends", "financial"]

    # First half of the training texts is cat-related, second half finance.
    trX = cat_words * n_duplicates + finance_words * n_duplicates
    half = len(trX) // 2
    trY = ['cat'] * half + ['finance'] * half

    # Held-out words that appear in neither training list.
    teX = ["furball", "fiduciary"]
    teY = ["cat"] + ["finance"]

    model.fit(trX, trY)
    predY = model.predict(teX)
    print(predY)
    self.assertEqual(accuracy_score(teY, predY), 1.00)
class StanfordSentimentTreebank(Dataset):
    """Dataset wrapper around the binary Stanford Sentiment Treebank CSV."""

    def __init__(self, filename=None, **kwargs):
        # Fall back to the module-level DATA_PATH when no filename is given.
        resolved = filename or DATA_PATH
        super().__init__(filename=resolved, **kwargs)

    def md5(self):
        """Return the expected checksum constant for the data file."""
        return CHECKSUM

    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        # Make sure the directory that will hold self.filename exists.
        target = Path(self.filename)
        target.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME,
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(
        verbose=True,
        n_epochs=2,
        val_size=0.01,
        val_interval=10,
        visible_gpus=[],
        tensorboard_folder='.tensorboard',
    )
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text,
        dataset.Target,
        test_size=0.3,
        random_state=42,
    )
    model.fit(trainX, trainY)

    # Accuracy = share of test rows whose prediction equals the label.
    matches = model.predict(testX) == testY
    accuracy = np.mean(matches)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
""" Download Stanford Sentiment Treebank to data directory """ path = Path(self.filename) path.parent.mkdir(parents=True, exist_ok=True) generic_download( url="https://s3.amazonaws.com/enso-data/SST-binary.csv", text_column="Text", target_column="Target", filename=SST_FILENAME) if __name__ == "__main__": # Train and evaluate on SST dataset = StanfordSentimentTreebank(nrows=1000).dataframe model = Classifier(debugging_logs=True, interpolate_pos_embed=False, n_epochs=3, batch_size=2, lr_warmup=0.1, max_length=64, base_model=GPTModel) trainX, testX, trainY, testY = train_test_split(dataset.Text.values, dataset.Target.values, test_size=0.3, random_state=42) model.fit(trainX, trainY) preds = model.predict(testX) print(preds, testY) print(classification_report(testY, preds))
# Finetune base model on custom data
model.fit(trainX_res_list, trainY_res_list)
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

# Serialize the model to disk
model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print(testX.shape)
print(model)

# Time the prediction pass over the whole test set.
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")

print("Evaluating accuracy")
# Translate predicted and true labels through labelsMap.
mainPredictions = [labelsMap[pred] for pred in predictions]
mainTestY = [labelsMap[testLabel] for testLabel in testY.tolist()]

correctMain = 0
countMain = 0
def download(self):
    """
    Download Stanford Sentiment Treebank to data directory
    """
    # Make sure the directory that will hold self.filename exists.
    target = Path(self.filename)
    target.parent.mkdir(parents=True, exist_ok=True)
    generic_download(
        url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
        text_column="Text",
        target_column="Target",
        filename=SST_FILENAME,
    )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(
        verbose=True,
        n_epochs=2,
        val_size=0.01,
        val_interval=10,
        visible_gpus=[],
        tensorboard_folder='.tensorboard',
    )
    trainX, testX, trainY, testY = train_test_split(
        dataset.Text,
        dataset.Target,
        test_size=0.3,
        random_state=42,
    )
    model.fit(trainX, trainY)

    # Accuracy = share of test rows whose prediction equals the label.
    matches = model.predict(testX) == testY
    accuracy = np.mean(matches)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
"""GPT2imdb.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_484wco-2YnrTKVJr5wN4qW4RDQIuKZD
"""

import pandas as pd

import finetune
from finetune import Classifier
from finetune import LanguageModel
from finetune.base_models import (
    BERT,
    BERTLarge,
    GPT2,
    GPT2Medium,
    GPT2Large,
    TextCNN,
    TCN,
    RoBERTa,
    DistilBERT,
)

# Load the name -> genre table and pull out the two columns used below.
url = 'https://raw.githubusercontent.com/BillGu19/Bass/master/name_genre_identifiers.csv'
name_genre = pd.read_csv(url)
name = name_genre['primaryName']
genre = name_genre['top genre']

# Fit a GPT2-based classifier that maps a name to its top genre.
model = Classifier(base_model=GPT2)
model.fit(name, genre)

# Predict the genre for a handful of names and show the result.
testX = [
    'Tom Cruise',
    'Jamie Lee Curtis',
    'Claire Danes',
    'Geena Davis',
    'Robert De Niro',
    'John Denver',
    'Johnny Depp',
    'Leonardo DiCaprio',
    'Clint Eastwood',
]
predictions = model.predict(testX)
print(predictions)