def test_fit_lm_only(self):
    """
    Ensure LM only training does not error out.

    Fits a fresh ``Classifier`` on text alone, round-trips it through
    ``save``/``load``, fits it again with text and targets, and finally
    checks the element types returned by ``predict`` and ``predict_proba``.
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can still be fit with only text
    model.fit(train_sample.Text)

    # Save and reload check
    save_file = 'tests/saved-models/test-save-load'
    model.save(save_file)
    model = Classifier.load(save_file)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)

    # `np.int` was only an alias of the builtin `int` and no longer exists in
    # current NumPy releases, so accept the builtin plus any NumPy integer.
    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        self.assertIsInstance(prediction, (int, np.integer))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
async def classifyOpen311Complaint(request):
    """
    Classify one or several complaint descriptions sent as JSON.

    The request body may carry ``description`` (a single text) or
    ``descriptions`` (several texts); ``descriptions`` wins when both are
    present. Each text goes through ``preProcess`` before being handed to the
    module-level ``model``, which is loaded on first use.

    Responses (all built with ``json(...)``):
        * ``{"result": "No data in request"}`` when there is no JSON body.
        * ``{'service_code': 'unknown'}`` when neither text field is present.
        * ``{'service_code': prediction}`` when the request has no
          ``service_code`` of its own.
        * the incoming payload with ``service_code`` overwritten otherwise.
    """
    global model

    payload = request.json

    # Check if data provided
    if payload is None:
        # This has to be a dict; the previous `{"result", "..."}` was a set
        # literal, which is not a JSON object.
        return json({"result": "No data in request"})

    description = payload.get('description')
    descriptions = payload.get('descriptions')

    # Check if we have a 311 'description' field
    if description is None and descriptions is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it
    if model is None:
        # NOTE(review): this instance is immediately replaced by the loaded
        # one; kept in case construction has side effects load() relies on.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    if descriptions is not None:
        processedComplaints = [preProcess(text) for text in descriptions]
        prediction = model.predict(processedComplaints).tolist()
    else:
        print("Doing simple prediction")
        prediction = model.predict([preProcess(description)])[0]

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an
    # Open311 message, so we update the service_code and return the full
    # message. Otherwise we just send back a new message with the
    # service_code only
    if payload.get('service_code') is None:
        print("No service code provided, returning one")
        return json({'service_code': prediction})

    print("Service_code was provided so updating it")
    payload['service_code'] = prediction
    return json(payload)
def test_save_load(self):
    """
    Round-trip a fitted classifier through save/load.

    Checks that the regular and the float16 checkpoints stay below their
    size limits, and that the model reloaded from the float16 checkpoint
    reproduces the predictions made before saving.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    checkpoint_path_fp16 = "tests/saved-models/test-save-load_fp16"

    settings = self.default_config(save_adam_vars=False)
    classifier = Classifier(**settings)

    fit_rows = self.dataset.sample(n=self.n_sample)
    eval_rows = self.dataset.sample(n=self.n_sample)
    classifier.fit(fit_rows.Text, fit_rows.Target)
    expected = classifier.predict(eval_rows.Text)

    # testing file size reduction options
    classifier.save(checkpoint_path)
    saved_bytes = os.stat(checkpoint_path).st_size
    self.assertLess(saved_bytes, 500000000)

    # reducing floating point precision
    classifier.saver.save_dtype = np.float16
    classifier.save(checkpoint_path_fp16)
    saved_bytes_fp16 = os.stat(checkpoint_path_fp16).st_size
    self.assertLess(saved_bytes_fp16, 260000000)

    # Rebind the same name so the fitted instance is released before use.
    classifier = Classifier.load(checkpoint_path_fp16)
    actual = classifier.predict(eval_rows.Text)

    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def generate_GPT_feats(model_path, post_level=True):
    """
    Featurize texts with a saved ``Classifier`` and aggregate per post.

    Args:
        model_path: Location passed to ``Classifier.load``.
        post_level: When true, read ``all_posts_data.csv`` and keep rows that
            are flagged ``predict_me`` or have a non-null ``label``;
            otherwise read ``sentences.csv`` and treat its ``body`` column as
            the text.

    Returns:
        The result of ``flatten_cols`` applied to the mean/max/min of every
        feature column grouped by ``post_id``. Feature columns are prefixed
        with ``post_lvl-`` or ``sentence_lvl-`` depending on ``post_level``.
    """
    if post_level:
        frame = pd.read_csv(PROCESSED_PATH / 'all_posts_data.csv')
        wanted_rows = frame.predict_me | (frame.label.notnull())
        frame = frame[wanted_rows].loc[:, ['post_id', 'cleaned_body']]
        column_prefix = 'post_lvl-'
    else:
        frame = pd.read_csv(PROCESSED_PATH / 'sentences.csv')
        frame = frame.rename(columns={'body': 'cleaned_body'})
        column_prefix = 'sentence_lvl-'

    featurizer = Classifier.load(model_path)
    raw_texts = list(frame.cleaned_body.astype(str))
    feature_matrix = featurizer.featurize(raw_texts)

    # generate a df with features as cols, with index as post_id
    embeddings = pd.DataFrame(feature_matrix)
    embeddings.index = frame.post_id
    embeddings = embeddings.add_prefix(column_prefix)

    aggregated = embeddings.groupby('post_id').agg(['mean', 'max', 'min'])
    return flatten_cols(aggregated)
def test_save_load_language_model(self):
    """
    Exercise text generation before fitting, after fitting and after reload.

    The start token must be absent from the output of an unfitted model and
    present in the output of the fitted model and of the reloaded model.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    lm = Classifier()

    generated = lm.generate_text("The quick brown fox", 6)
    start_index = lm.input_pipeline.text_encoder.start_token
    start_marker = lm.input_pipeline.text_encoder.decoder[start_index]
    # Non finetuned models do not use extra tokens
    self.assertNotIn(start_marker, generated)

    fit_rows = self.dataset.sample(n=self.n_sample)
    lm.fit(fit_rows.Text, fit_rows.Target)

    generated = lm.generate_text("", 5)
    self.assertIn(start_marker, generated.lower())
    self.assertEqual(type(generated), str)

    lm.save(checkpoint_path)
    lm = Classifier.load(checkpoint_path)

    regenerated = lm.generate_text("Indico RULE")
    self.assertEqual(type(regenerated), str)

    # Both of these models use extra toks
    expected_fragment = "{}Indico RULE".format(start_marker).lower()
    self.assertIn(expected_fragment, regenerated.lower())
def post(self):
    """
    Handle a POST on the google interface and classify the query text.

    Reads the JSON body from the module-level ``request``, extracts
    ``queryResult.queryText`` and classifies it with the module-level
    ``model`` (loaded on first use).

    Returns:
        ``{'fulfillmentText': <prediction>}``, or
        ``{'fulfillmentText': 'unknown'}`` when the body is missing, is not a
        JSON object, or does not carry ``queryResult.queryText``.
    """
    global model
    print("Received POST request on google interface")

    # if the classifier has not been loaded then load it
    if model is None:
        # NOTE(review): this instance is immediately replaced by the loaded
        # one; kept in case construction has side effects load() relies on.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # check if the JSON description has been filled in
    some_json = request.get_json()

    # A missing or non-object body previously crashed on `.get`; treat it
    # like any other request without message text.
    if not isinstance(some_json, dict):
        print("Empty message text")
        return {'fulfillmentText': 'unknown'}

    queryResult = some_json.get('queryResult')
    if not isinstance(queryResult, dict):
        print("Empty message text")
        return {'fulfillmentText': 'unknown'}

    newTextDescription = queryResult.get('queryText')
    if newTextDescription is None:
        print("Empty message text")
        return {'fulfillmentText': 'unknown'}

    print("received: ", newTextDescription)

    # Predict the classification of the text
    prediction = model.predict([newTextDescription])

    # Return the result
    print("returning: ", prediction[0])
    return {'fulfillmentText': prediction[0]}
def post(self):
    """
    Handle a POST on the Open311 interface and classify the description.

    Reads the JSON body from the module-level ``request`` and classifies its
    ``description`` field with the module-level ``model`` (loaded on first
    use).

    Returns:
        * ``{'service_code': 'unknown'}`` when the body is missing, is not a
          JSON object, or has no ``description``.
        * ``{'service_code': <prediction>}`` when the body has no
          ``service_code`` of its own.
        * the incoming body with ``service_code`` overwritten otherwise.
    """
    global model
    print("Received POST request on Open311 interface")

    # if the classifier has not been loaded then load it
    if model is None:
        # NOTE(review): this instance is immediately replaced by the loaded
        # one; kept in case construction has side effects load() relies on.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # check if the JSON description has been filled in
    some_json = request.get_json()
    print("Received JSON: ", some_json)

    # A missing or non-object body previously crashed on `.get`; answer it
    # the same way as a body without a description.
    if not isinstance(some_json, dict):
        return {'service_code': 'unknown'}

    newTextDescription = some_json.get('description')
    if newTextDescription is None:
        return {'service_code': 'unknown'}

    print("received: ", newTextDescription)
    prediction = model.predict([newTextDescription])

    # check if the input data also contained the service code and if so
    # replace it and return the original message
    if some_json.get('service_code') is None:
        # No service code so just return that
        print("No service code provided, returning one")
        return {'service_code': prediction[0]}

    print("Service_code was provided so updating it")
    some_json['service_code'] = prediction[0]
    return some_json
async def processGoogleActionRequest(request):
    """
    Classify the query text of a google-interface request.

    Extracts ``queryResult.queryText`` from ``request.json``, runs it through
    ``preProcess`` and classifies it with the module-level ``model`` (loaded
    on first use, after the request has been validated).

    Responses (all built with ``json(...)``):
        * ``{"result": "No data in request"}`` when there is no JSON body.
        * ``{'fulfillmentText': 'unknown'}`` when ``queryResult`` or its
          ``queryText`` is missing.
        * ``{'fulfillmentText': <prediction>}`` otherwise.
    """
    global model
    print("Received POST request on google interface")

    some_json = request.json

    # Check if data provided
    if some_json is None:
        # This has to be a dict; the previous `{"result", "..."}` was a set
        # literal, which is not a JSON object.
        return json({"result": "No data in request"})

    queryResult = some_json.get('queryResult')
    if queryResult is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})

    newTextDescription = queryResult.get('queryText')
    if newTextDescription is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})

    print("received: ", newTextDescription)
    processedDescription = preProcess(newTextDescription)
    print("pre-processed: ", processedDescription)

    # If the model is not already loaded then load it
    if model is None:
        # NOTE(review): this instance is immediately replaced by the loaded
        # one; kept in case construction has side effects load() relies on.
        model = Classifier(max_length=512, val_interval=3000, verbose=True)
        model = Classifier.load("/root/combined_model_20181021")

    # Predict the classification of the text
    prediction = model.predict([processedDescription])

    # Return the result
    print("returning: ", prediction[0])
    return json({'fulfillmentText': prediction[0]})
def setUpClass(cls):
    """
    Build the fixtures shared by the tests of this class.

    Loads the classifier dataset and the JSON text/label pairs, then obtains
    a ``SequenceLabeler`` and a ``Classifier`` (and, when
    ``cls.do_comparison`` is set, a ``ComparisonRegressor``). Each model is
    loaded from its saved path if possible; on ``FileNotFoundError`` it is
    constructed, fitted and saved to that path instead.

    NOTE(review): no ``@classmethod`` decorator is visible in this view;
    confirm it is present on the line above.
    NOTE(review): the source was collapsed onto long lines, so the nesting
    of the ``try``/``except`` and ``if`` bodies below is reconstructed;
    confirm against the original file.
    """
    cls._download_data()

    # dataset preparation: read only the first n_sample * 10 rows
    cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)

    # testdata.json lives in the "data" directory next to this file and
    # unpacks into exactly two values: texts and labels
    path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
    with open(path, 'rt') as fp:
        cls.texts, cls.labels = json.load(fp)

    # word lists used below to build the comparison pairs
    cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
    cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]

    # train and save sequence labeler for later use
    # (default_seq_config is invoked with cls passed explicitly as its argument)
    try:
        cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
    except FileNotFoundError:
        cls.s = SequenceLabeler(**cls.default_seq_config(cls))
        cls.s.fit(cls.texts * 10, cls.labels * 10)
        cls.s.save(cls.sequence_labeler_path)

    # train and save classifier for later use
    # (the sample is drawn before the load attempt, whether or not it is used)
    train_sample = cls.classifier_dataset.sample(n=cls.n_sample*10)
    try:
        cls.cl = Classifier.load(cls.classifier_path)
    except FileNotFoundError:
        cls.cl = Classifier(**cls.default_config(cls))
        cls.cl.fit(train_sample.Text, train_sample.Target)
        cls.cl.save(cls.classifier_path)

    if cls.do_comparison:
        # train and save comparison regressor for use
        # (this instance is replaced by the try/except further down)
        cls.cr = ComparisonRegressor()

        n_per = 150
        similar = []
        different = []

        # "similar" pairs: both words drawn from the same list,
        # n_per // 2 pairs per list
        for dataset in [cls.animals, cls.numbers]:
            for i in range(n_per // 2):
                similar.append([random.choice(dataset), random.choice(dataset)])

        # "different" pairs: one animal paired with one number
        for i in range(n_per):
            different.append([random.choice(cls.animals), random.choice(cls.numbers)])

        # target 1 for similar pairs, 0 for different pairs
        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(data, targets, test_size=0.3, random_state=42)

        try:
            cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)
def test_save_load_language_model(self):
    """
    Ensure saving + loading does not cause errors.

    Fits a classifier, generates text, saves and reloads the model, then
    generates again from a seed and checks that the seed, preceded by the
    ``_start_`` marker, appears in the output.
    """
    save_file = 'tests/saved-models/test-save-load'
    model = Classifier(verbose=False)

    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    lm_out = model.generate_text("", 5)
    self.assertEqual(type(lm_out), str)

    model.save(save_file)
    model = Classifier.load(save_file)

    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)

    # Lower-case both sides: the expected fragment is lower-cased, so the
    # generated text has to be as well for the check to be case-insensitive.
    self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())
def test_save_load(self):
    """
    Check that predictions are unchanged by a save/load round trip.

    A classifier built from the default config is fitted on one sample,
    asked for predictions on another, saved, reloaded and asked again; the
    two sets of predictions are compared element by element.
    """
    checkpoint_path = 'tests/saved-models/test-save-load'
    classifier = Classifier(config=self.default_config())

    fit_rows = self.dataset.sample(n=self.n_sample)
    eval_rows = self.dataset.sample(n=self.n_sample)

    classifier.fit(fit_rows.Text, fit_rows.Target)
    expected = classifier.predict(eval_rows.Text)

    classifier.save(checkpoint_path)
    classifier = Classifier.load(checkpoint_path)
    actual = classifier.predict(eval_rows.Text)

    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def test_save_load_language_model(self):
    """
    Check text generation on a fitted model and on its reloaded copy.

    Both outputs must be plain ``str``; the output of the reloaded model
    must contain the seed text preceded by the encoder's start token
    (compared case-insensitively).
    """
    checkpoint_path = 'tests/saved-models/test-save-load'
    lm = Classifier()

    fit_rows = self.dataset.sample(n=self.n_sample)
    lm.fit(fit_rows.Text, fit_rows.Target)

    generated = lm.generate_text("", 5)
    self.assertEqual(type(generated), str)

    lm.save(checkpoint_path)
    lm = Classifier.load(checkpoint_path)

    regenerated = lm.generate_text("Indico RULE")
    self.assertEqual(type(regenerated), str)

    # Look the start token up on the reloaded model's encoder.
    start_index = lm.input_pipeline.text_encoder.start
    start_marker = lm.input_pipeline.text_encoder.decoder[start_index]
    expected_fragment = '{}Indico RULE'.format(start_marker).lower()
    self.assertIn(expected_fragment, regenerated.lower())
def test_save_load(self):
    """
    Check that context-aware predictions survive a save/load round trip.

    Fits a one-epoch classifier on the training data with its context,
    predicts on that same data, saves, reloads and predicts again; the two
    sets of predictions are compared element by element.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    settings = self.default_config(save_adam_vars=False, n_epochs=1)
    classifier = Classifier(**settings)

    classifier.fit(self.trainX, self.trainY, context=self.train_context)
    expected = classifier.predict(self.trainX, context=self.train_context)

    classifier.save(checkpoint_path)
    classifier = Classifier.load(checkpoint_path)
    actual = classifier.predict(self.trainX, context=self.train_context)

    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def test_save_load(self):
    """
    Check that predictions are unchanged by a save/load round trip.

    The training targets from ``self.dataset`` are replaced by random 0/1
    labels of the same length before fitting a one-epoch classifier; the
    predictions on the test split before saving and after reloading are
    compared element by element.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    settings = self.default_config(save_adam_vars=False, n_epochs=1)
    classifier = Classifier(**settings)

    train_inputs, test_inputs, original_targets, _unused = self.dataset
    # One random binary label per original target.
    random_targets = [random.randint(0, 1) for _ in original_targets]

    classifier.fit(train_inputs, random_targets)
    expected = classifier.predict(test_inputs)

    classifier.save(checkpoint_path)
    classifier = Classifier.load(checkpoint_path)
    actual = classifier.predict(test_inputs)

    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])