def test_fit_predict(self):
    """
    Ensure fitting on paired texts completes without raising
    Ensure predict yields str/bytes labels and predict_proba yields dicts
    """
    sentence_pair = [
        "Transformers was a terrible movie but a great model",
        "Transformers are a great model but a terrible movie",
    ]
    n_samples = 10
    model = Comparison(**self.default_config())

    # The same pair is repeated for every training row, all labelled 'yes'.
    model.fit([sentence_pair] * n_samples, ['yes'] * n_samples)

    # A fresh single-row input holding the same two sentences.
    test_data = [list(sentence_pair)]

    label_types = (str, bytes)
    for label in model.predict(test_data):
        self.assertIsInstance(label, label_types)

    for class_probs in model.predict_proba(test_data):
        self.assertIsInstance(class_probs, dict)
def test_fit_predict(self):
    """
    Ensure fitting on two parallel text lists completes without raising
    Ensure every prediction is a str or bytes label
    """
    n_samples = 10
    first_texts = ["Indico is the best"] * n_samples
    second_texts = ["Indico is the bestestestest"] * n_samples
    targets = ['yes'] * n_samples

    model = Comparison(**self.default_config())
    model.fit(first_texts, second_texts, targets)

    # Single query pair, passed as two one-element lists.
    predicted = model.predict(
        ["Is indico the best?"],
        ["Indico is the bestestestest"],
    )
    for label in predicted:
        self.assertIsInstance(label, (str, bytes))
def test_comparison_auxiliary(self):
    """
    Ensure model training with auxiliary context does not error out
    Ensure model returns one prediction per input pair

    Reads the first two entries of ``self.train_context``.
    """
    model = Comparison(**self.default_config(
        chunk_long_sequences=False, max_length=50, batch_size=4))
    trainX = [['i like apples', 'i like apples']] * 4
    trainY = ['A', 'B', 'C', 'D']
    # All four ordered pairings of the first two context entries, so there is
    # exactly one context pair for each of the four training rows.
    train_context = [
        [self.train_context[i], self.train_context[j]]
        for i in [0, 1]
        for j in [0, 1]
    ]
    model.fit(trainX, trainY, context=train_context)
    preds = model.predict(trainX, context=train_context)
    # Previously the predictions were computed and discarded; at minimum,
    # check that every input pair produced a prediction.
    self.assertEqual(len(preds), len(trainX))
def __init__(self, filename=None, **kwargs):
    """
    Initialise the dataset, using DATA_PATH when `filename` is falsy.
    Remaining keyword arguments are forwarded to the parent initialiser.
    """
    target_file = filename or DATA_PATH
    super().__init__(filename=target_file, **kwargs)


@property
def md5(self):
    """
    Return the module-level CHECKSUM constant.
    """
    return CHECKSUM


def download(self):
    """
    Download quora duplicate questions dataset.
    """
    # Create the directory that holds self.filename before fetching.
    Path(self.filename).parent.mkdir(parents=True, exist_ok=True)
    comparison_download(
        filename=QUORA_SIMILARITY,
        url="http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv",
        text_column1="question1",
        text_column2="question2",
        target_column="is_duplicate",
    )


if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate-questions data
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(verbose=True, n_epochs=3)

    # 70/30 train/test split with a fixed seed.
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(
        dataset.Text1,
        dataset.Text2,
        dataset.Target,
        test_size=0.3,
        random_state=42,
    )

    model.fit(trainX1, trainX2, trainY)

    predicted = model.predict(testX1, testX2)
    accuracy = np.mean(predicted == testY)
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(accuracy, class_balance))
# NOTE(review): this chunk opens mid-definition. The statement below is the
# tail of a definition whose header lies above the visible source --
# presumably an `md5` property; confirm against the full file.
return CHECKSUM


def download(self):
    """
    Download quora duplicate questions dataset.

    Creates the parent directory of ``self.filename`` if needed, then calls
    ``comparison_download`` with the Quora CSV URL and the ``Text1`` /
    ``Text2`` / ``Target`` column names.
    """
    path = Path(self.filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    comparison_download(url="https://s3.amazonaws.com/enso-data/Quora.csv",
                        text_column1="Text1",
                        text_column2="Text2",
                        target_column="Target",
                        filename=QUORA_SIMILARITY)


if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate-questions data
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(n_epochs=1)
    # 70/30 train/test split with a fixed seed.
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(
        dataset.Text1.values,
        dataset.Text2.values,
        dataset.Target.values,
        test_size=0.3,
        random_state=42)
    # Each example is passed to the model as one (text1, text2) tuple.
    model.fit(list(zip(trainX1, trainX2)), trainY)
    # Mean of the element-wise prediction/target equality.
    accuracy = np.mean(model.predict(list(zip(testX1, testX2))) == testY)
    # Mean of the test targets; presumably the positive-class share if the
    # targets are 0/1 -- TODO confirm.
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(
        accuracy, class_balance))
def __init__(self, filename=None, **kwargs):
    """
    Set up the dataset; a falsy `filename` is replaced by DATA_PATH.
    Any other keyword arguments go straight to the parent initialiser.
    """
    chosen_path = filename or DATA_PATH
    super().__init__(filename=chosen_path, **kwargs)


@property
def md5(self):
    """
    Return the module-level CHECKSUM constant.
    """
    return CHECKSUM


def download(self):
    """
    Download quora duplicate questions dataset.
    """
    # Create the directory that holds self.filename before fetching.
    Path(self.filename).parent.mkdir(parents=True, exist_ok=True)
    comparison_download(
        filename=QUORA_SIMILARITY,
        url="http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv",
        text_column1="question1",
        text_column2="question2",
        target_column="is_duplicate",
    )


if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate-questions data
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(verbose=True, n_epochs=1)

    # 70/30 train/test split with a fixed seed.
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(
        dataset.Text1,
        dataset.Text2,
        dataset.Target,
        test_size=0.3,
        random_state=42,
    )

    # The model takes each example as one (text1, text2) tuple.
    train_pairs = list(zip(trainX1, trainX2))
    model.fit(train_pairs, trainY)

    test_pairs = list(zip(testX1, testX2))
    predicted = model.predict(test_pairs)
    accuracy = np.mean(predicted == testY)
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(accuracy, class_balance))