@classmethod
def setUpClass(cls):
        cls._download_data()
        
        # dataset preparation
        cls.classifier_dataset = pd.read_csv(cls.classifier_dataset_path, nrows=cls.n_sample * 10)

        path = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
        with open(path, 'rt') as fp:
            cls.texts, cls.labels = json.load(fp)

        cls.animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
        cls.numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
        
        # train and save sequence labeler for later use
        try:
            cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
        except FileNotFoundError:
            cls.s = SequenceLabeler(**cls.default_seq_config(cls))
            cls.s.fit(cls.texts * 10, cls.labels * 10)
            cls.s.save(cls.sequence_labeler_path)
        
        # train and save classifier for later use
        train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
        try:
            cls.cl = Classifier.load(cls.classifier_path)
        except FileNotFoundError:
            cls.cl = Classifier(**cls.default_config(cls))
            cls.cl.fit(train_sample.Text, train_sample.Target)
            cls.cl.save(cls.classifier_path)

        if cls.do_comparison:
            # train and save comparison regressor for later use
    
            n_per = 150
            similar = []
            different = []
            for dataset in [cls.animals, cls.numbers]:
                for i in range(n_per // 2):
                    similar.append([random.choice(dataset), random.choice(dataset)])
            for i in range(n_per):
                different.append([random.choice(cls.animals), random.choice(cls.numbers)])

            # label 1 for same-vocabulary pairs, 0 for cross-vocabulary pairs
            targets = np.asarray([1] * len(similar) + [0] * len(different))
            data = similar + different

            cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(data, targets, test_size=0.3, random_state=42)
            
            try:
                cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
            except FileNotFoundError:
                cls.cr = ComparisonRegressor(**cls.default_config(cls))
                cls.cr.fit(cls.x_tr, cls.t_tr)
                cls.cr.save(cls.comparison_regressor_path)
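
The three try/except blocks above share a load-or-train caching pattern: reuse a model saved by a previous run, otherwise fit one and save it for next time. A generic helper expressing that pattern (hypothetical, not part of the suite; it uses only the load/fit/save calls shown above):

def load_or_train(model_cls, path, fit_args, **config):
    # Reuse a model cached at `path` by an earlier run; otherwise
    # fit a fresh one and cache it for subsequent runs.
    try:
        return model_cls.load(path, **config)
    except FileNotFoundError:
        model = model_cls(**config)
        model.fit(*fit_args)
        model.save(path)
        return model

With it, the sequence-labeler block above would reduce to cls.s = load_or_train(SequenceLabeler, cls.sequence_labeler_path, (cls.texts * 10, cls.labels * 10), **cls.default_seq_config(cls)).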
Example #2
    def test_comparison_regressor_auxiliary(self):
        """
        Ensure model training does not error out
        Ensure model returns reasonable predictions
        """
        model = ComparisonRegressor(**self.default_config(
            chunk_long_sequences=False, max_length=50, batch_size=4))
        trainX = [['i like apples', 'i like apples']] * 4
        trainY = [0, .5, .5, 1]
        # pair the two available context sequences in all four combinations,
        # one pair of contexts per pair of documents
        train_context = [[self.train_context[i], self.train_context[j]]
                         for i in [0, 1] for j in [0, 1]]
        model.fit(trainX, trainY, context=train_context)
        preds = model.predict(trainX, context=train_context)
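
For comparison models, each example is a two-element list (the two texts being compared), and auxiliary context mirrors that shape: one context sequence per side. A minimal standalone sketch of the pairing built above, with ctx_a/ctx_b as stand-ins for the two entries of self.train_context:

# All four ordered pairs of the two context sequences, matching the
# comprehension above: (0, 0), (0, 1), (1, 0), (1, 1).
c0, c1 = "ctx_a", "ctx_b"  # stand-ins; the real entries are context sequences
pairs = [[a, b] for a in (c0, c1) for b in (c0, c1)]
assert pairs == [["ctx_a", "ctx_a"], ["ctx_a", "ctx_b"],
                 ["ctx_b", "ctx_a"], ["ctx_b", "ctx_b"]]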
    def test_reasonable_predictions(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        Test that model loss at least outperforms a naive baseline
        """
        model = ComparisonRegressor(**self.default_config())

        # fake dataset generation
        animals = [
            "dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken",
            "guinea pig", "donkey", "turkey", "duck", "camel", "goose",
            "llama", "rabbit", "fox"
        ]
        numbers = [
            "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
            "fifteen", "sixteen"
        ]

        n_per = 150
        similar = []
        different = []
        for dataset in [animals, numbers]:
            for i in range(n_per // 2):
                similar.append(
                    [random.choice(dataset),
                     random.choice(dataset)])
        for i in range(n_per):
            different.append([random.choice(animals), random.choice(numbers)])

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        x_tr, x_te, t_tr, t_te = train_test_split(data,
                                                  targets,
                                                  test_size=0.3,
                                                  random_state=42)
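        # note: the split itself is reproducible (random_state=42), but the
        # pair generation above uses the unseeded `random` module, so the
        # underlying dataset varies from run to run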
        model.finetune(x_tr, t_tr)

        predictions = model.predict(x_te)
        mse = np.mean([(pred - true)**2
                       for pred, true in zip(predictions, t_te)])
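        # the generated labels are balanced (150 similar vs. 150 different
        # pairs), so the best constant guess is 0.5 and its MSE floor is
        # 0.5 ** 2 = 0.25; the model must come in under that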
        naive_baseline = max(np.mean(targets == 1), np.mean(targets == 0))
        naive_baseline_mse = np.mean([(naive_baseline - true)**2
                                      for true in t_te])
        self.assertIsInstance(predictions, np.ndarray)
        self.assertIsInstance(predictions[0], np.float32)
        self.assertGreater(naive_baseline_mse, mse)
    def test_reasonable_predictions(self):
        """
        Ensure model produces reasonable predictions after loading weights
        """
        model = DeploymentModel(featurizer=self.base_model, **self.default_seq_config())
        model.load_featurizer()

        # test same output as weights loaded with Classifier model
        valid_sample = self.classifier_dataset.sample(n=self.n_sample)
        model.load_custom_model(self.classifier_path)
        deployment_preds = model.predict_proba(valid_sample.Text.values)
        model.close()
        classifier_preds = self.cl.predict_proba(valid_sample.Text.values)

        for c_pred, d_pred in zip(classifier_preds, deployment_preds):
            self.assertEqual(list(c_pred.keys()), list(d_pred.keys()))
            for c_pred_val, d_pred_val in zip(c_pred.values(), d_pred.values()):
                np.testing.assert_almost_equal(c_pred_val, d_pred_val, decimal=4)

        if self.do_comparison:
            # test same output as weights loaded with ComparisonRegressor model
            model = DeploymentModel(featurizer=self.base_model, **self.default_seq_config())
            model.load_featurizer()
            model.load_custom_model(self.comparison_regressor_path)
            deployment_preds = model.predict(self.x_te)
            model.close()
            compregressor = ComparisonRegressor.load(self.comparison_regressor_path, **self.default_comp_config())
            compregressor_preds = compregressor.predict(self.x_te)
            for c_pred, d_pred in zip(compregressor_preds, deployment_preds):
                np.testing.assert_almost_equal(c_pred, d_pred, decimal=4)
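
The deployment test above follows a load-once, swap-heads pattern. A condensed sketch of that flow, using only calls exercised in the test (base_model and classifier_path stand in for the suite's self.base_model and self.classifier_path):

# Condensed deployment flow; identifiers are stand-ins for the suite's fixtures.
model = DeploymentModel(featurizer=base_model)
model.load_featurizer()                    # load the shared featurizer once
model.load_custom_model(classifier_path)   # swap in a lightweight task head
probs = model.predict_proba(["some document text"])
model.close()                              # release the underlying session

As the assertions above check, predictions through this path are expected to match the natively loaded models to four decimal places.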