@classmethod
def setUpClass(cls):
    cls._download_data()

    # Dataset preparation
    cls.classifier_dataset = pd.read_csv(
        cls.classifier_dataset_path, nrows=cls.n_sample * 10
    )
    path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
    with open(path, "rt") as fp:
        cls.texts, cls.labels = json.load(fp)
    cls.animals = [
        "dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken",
        "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama",
        "rabbit", "fox",
    ]
    cls.numbers = [
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
        "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    ]

    # Train and save a sequence labeler for later use
    try:
        cls.s = SequenceLabeler.load(
            cls.sequence_labeler_path, **cls.default_seq_config(cls)
        )
    except FileNotFoundError:
        cls.s = SequenceLabeler(**cls.default_seq_config(cls))
        cls.s.fit(cls.texts * 10, cls.labels * 10)
        cls.s.save(cls.sequence_labeler_path)

    # Train and save a classifier for later use
    train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
    try:
        cls.cl = Classifier.load(cls.classifier_path)
    except FileNotFoundError:
        cls.cl = Classifier(**cls.default_config(cls))
        cls.cl.fit(train_sample.Text, train_sample.Target)
        cls.cl.save(cls.classifier_path)

    if cls.do_comparison:
        # Build a synthetic similarity dataset: pairs drawn from the same
        # word list are "similar" (target 1), cross-list pairs are
        # "different" (target 0).
        n_per = 150
        similar = []
        different = []
        for dataset in [cls.animals, cls.numbers]:
            for _ in range(n_per // 2):
                similar.append([random.choice(dataset), random.choice(dataset)])
        for _ in range(n_per):
            different.append(
                [random.choice(cls.animals), random.choice(cls.numbers)]
            )

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(
            data, targets, test_size=0.3, random_state=42
        )

        # Train and save a comparison regressor for later use
        try:
            cls.cr = ComparisonRegressor.load(
                cls.comparison_regressor_path, **cls.default_config(cls)
            )
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)
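# A minimal cleanup sketch to pair with setUpClass above; whether the suite
# wants to keep the saved models cached between runs is an open question, so
# treat this as an assumption rather than part of the original tests. It also
# assumes each *_path points at a single file on disk.
@classmethod
def tearDownClass(cls):
    for path in (
        cls.sequence_labeler_path,
        cls.classifier_path,
        cls.comparison_regressor_path,
    ):
        if os.path.exists(path):
            os.remove(path)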
def test_comparison_regressor_auxiliary(self):
    """
    Ensure model training does not error out
    Ensure model returns reasonable predictions
    """
    model = ComparisonRegressor(
        **self.default_config(chunk_long_sequences=False, max_length=50, batch_size=4)
    )
    trainX = [["i like apples", "i like apples"]] * 4
    trainY = [0, 0.5, 0.5, 1]
    # Pair up the auxiliary context for both sides of each comparison
    train_context = [
        [self.train_context[i], self.train_context[j]]
        for i in [0, 1]
        for j in [0, 1]
    ]
    model.fit(trainX, trainY, context=train_context)
    preds = model.predict(trainX, context=train_context)
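# The auxiliary test above only verifies that fit() and predict() complete
# without raising. A hedged sketch of a helper the test could call on `preds`
# to back up the "reasonable predictions" line of its docstring;
# assert_valid_regression_output is hypothetical and not part of the original
# suite.
def assert_valid_regression_output(self, preds, inputs):
    # One finite scalar prediction per input pair
    self.assertEqual(len(preds), len(inputs))
    for pred in preds:
        self.assertTrue(np.isfinite(pred))
# Usage (at the end of the test): self.assert_valid_regression_output(preds, trainX)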
def test_reasonable_predictions(self):
    """
    Ensure model training does not error out
    Ensure model returns predictions of the right type
    Test model loss at least outperforms a naive baseline
    """
    model = ComparisonRegressor(**self.default_config())

    # Fake dataset generation
    animals = [
        "dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken",
        "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama",
        "rabbit", "fox",
    ]
    numbers = [
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
        "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    ]

    n_per = 150
    similar = []
    different = []
    for dataset in [animals, numbers]:
        for _ in range(n_per // 2):
            similar.append([random.choice(dataset), random.choice(dataset)])
    for _ in range(n_per):
        different.append([random.choice(animals), random.choice(numbers)])

    targets = np.asarray([1] * len(similar) + [0] * len(different))
    data = similar + different

    x_tr, x_te, t_tr, t_te = train_test_split(
        data, targets, test_size=0.3, random_state=42
    )
    model.finetune(x_tr, t_tr)

    predictions = model.predict(x_te)
    mse = np.mean([(pred - true) ** 2 for pred, true in zip(predictions, t_te)])

    # The naive baseline always predicts the majority-class proportion
    naive_baseline = max(np.mean(targets == 1), np.mean(targets == 0))
    naive_baseline_mse = np.mean([(naive_baseline - true) ** 2 for true in t_te])

    self.assertIsInstance(predictions, np.ndarray)
    self.assertIsInstance(predictions[0], np.float32)
    self.assertGreater(naive_baseline_mse, mse)
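# The pair-generation block above duplicates the logic in setUpClass. A hedged
# refactor sketch: make_similarity_pairs is a hypothetical module-level helper,
# not part of the original suite. Pairs drawn from the same word list get
# target 1, cross-list pairs get target 0, matching the code it would replace.
def make_similarity_pairs(groups, n_per=150):
    similar = [
        [random.choice(group), random.choice(group)]
        for group in groups
        for _ in range(n_per // 2)
    ]
    different = [
        [random.choice(groups[0]), random.choice(groups[1])] for _ in range(n_per)
    ]
    targets = np.asarray([1] * len(similar) + [0] * len(different))
    return similar + different, targets
# Usage: data, targets = make_similarity_pairs([animals, numbers])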
def test_reasonable_predictions(self):
    """
    Ensure model produces reasonable predictions after loading weights
    """
    model = DeploymentModel(featurizer=self.base_model, **self.default_seq_config())
    model.load_featurizer()

    # Test same output as weights loaded with the Classifier model
    valid_sample = self.classifier_dataset.sample(n=self.n_sample)
    model.load_custom_model(self.classifier_path)
    deployment_preds = model.predict_proba(valid_sample.Text.values)
    model.close()

    classifier_preds = self.cl.predict_proba(valid_sample.Text.values)
    for c_pred, d_pred in zip(classifier_preds, deployment_preds):
        self.assertEqual(list(c_pred.keys()), list(d_pred.keys()))
        for c_pred_val, d_pred_val in zip(c_pred.values(), d_pred.values()):
            np.testing.assert_almost_equal(c_pred_val, d_pred_val, decimal=4)

    if self.do_comparison:
        # Test same output as weights loaded with the ComparisonRegressor model
        model = DeploymentModel(
            featurizer=self.base_model, **self.default_seq_config()
        )
        model.load_featurizer()
        model.load_custom_model(self.comparison_regressor_path)
        deployment_preds = model.predict(self.x_te)
        model.close()

        compregressor = ComparisonRegressor.load(
            self.comparison_regressor_path, **self.default_comp_config()
        )
        compregressor_preds = compregressor.predict(self.x_te)
        for c_pred, d_pred in zip(compregressor_preds, deployment_preds):
            np.testing.assert_almost_equal(c_pred, d_pred, decimal=4)
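# A hedged helper sketch for the dict-of-probabilities comparison in the
# classifier branch above; assert_prob_dicts_close is hypothetical, not part
# of the original suite. It matches the inline loop but reports which class
# label diverged when the assertion fails.
def assert_prob_dicts_close(self, expected, actual, decimal=4):
    self.assertEqual(list(expected.keys()), list(actual.keys()))
    for label in expected:
        np.testing.assert_almost_equal(
            expected[label], actual[label], decimal=decimal, err_msg=label
        )
# Usage: self.assert_prob_dicts_close(c_pred, d_pred) inside the zip loop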