Example #1
    def test_reasonable_predictions(self):
        model = Comparison(**self.default_config(n_epochs=3))

        # fake dataset generation
        animals = ["dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig", "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"]
        numbers = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]

        n_per = 50
        similar = []
        different = []
        # "similar" pairs draw both items from the same list;
        # "different" pairs mix one animal word with one number word
        for dataset in [animals, numbers]:
            for i in range(n_per // 2):
                similar.append([random.choice(dataset), random.choice(dataset)])
        for i in range(n_per):
            different.append([random.choice(animals), random.choice(numbers)])
        
        targets = np.asarray(["similar"] * len(similar) + ["different"] * len(different))
        data = similar + different

        x_tr, x_te, t_tr, t_te = train_test_split(data, targets, test_size=0.3)
        model.finetune(*list_transpose(x_tr), t_tr)

        predictions = model.predict(*list_transpose(x_te))
        accuracy = np.mean([pred == true for pred, true in zip(predictions, t_te)])
        naive_baseline = max(np.mean(targets == "similar"), np.mean(targets == "different"))
        self.assertGreater(accuracy, naive_baseline)
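The test relies on a list_transpose helper that is not shown in this snippet. Judging from the call sites, its job is to turn a list of [left, right] pairs into two parallel lists that the Comparison API accepts as separate arguments. A minimal sketch, assuming only that behaviour:

def list_transpose(pairs):
    """Turn [[a1, b1], [a2, b2], ...] into [[a1, a2, ...], [b1, b2, ...]]."""
    # zip(*) groups the first elements together and the second elements together
    return [list(column) for column in zip(*pairs)]

# e.g. list_transpose([["dog", "cat"], ["one", "two"]])
# -> [["dog", "one"], ["cat", "two"]]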
Example #2
    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        """

        model = Comparison(**self.default_config())
        n_samples = 10
        model.fit(["Indico is the best"] * n_samples, ["Indico is the bestestestest"] * n_samples, ['yes'] * n_samples)

        predictions = model.predict(["Is indico the best?"], ["Indico is the bestestestest"])
        for prediction in predictions:
            self.assertIsInstance(prediction, (str, bytes))
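Note the calling convention here: the two sides of each comparison are passed as separate parallel lists, followed by the labels. The later examples instead pass a single list of [text1, text2] pairs; presumably the two styles reflect different versions of the Comparison API rather than interchangeable options.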
Example #3
    def test_comparison_auxiliary(self):
        """
        Ensure model training does not error out
        Ensure model returns reasonable predictions
        """
        model = Comparison(**self.default_config(
            chunk_long_sequences=False, max_length=50, batch_size=4))
        trainX = [['i like apples', 'i like apples']] * 4
        trainY = ['A', 'B', 'C', 'D']
        train_context = [[self.train_context[i], self.train_context[j]]
                         for i in [0, 1] for j in [0, 1]]
        model.fit(trainX, trainY, context=train_context)
        preds = model.predict(trainX, context=train_context)
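For clarity, the nested comprehension above simply pairs the first two context entries in every combination, so train_context expands to the following (a restatement of the comprehension, not new data):

        train_context = [
            [self.train_context[0], self.train_context[0]],
            [self.train_context[0], self.train_context[1]],
            [self.train_context[1], self.train_context[0]],
            [self.train_context[1], self.train_context[1]],
        ]  # one [left, right] context pair per training example in trainX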
Example #4
    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        """

        model = Comparison(**self.default_config())
        n_samples = 10
        model.fit([[
            "Transformers was a terrible movie but a great model",
            "Transformers are a great model but a terrible movie"
        ]] * n_samples, ['yes'] * n_samples)

        test_data = [[
            "Transformers was a terrible movie but a great model",
            "Transformers are a great model but a terrible movie"
        ]]

        predictions = model.predict(test_data)
        for prediction in predictions:
            self.assertIsInstance(prediction, (str, bytes))

        probabilities = model.predict_proba(test_data)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
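predict_proba returns one dict per example; assuming, as the name suggests, that each dict maps class labels to probabilities, the most likely label can be recovered like this (a usage sketch, not part of the original test):

        for proba in probabilities:
            # pick the label with the highest predicted probability
            best_label = max(proba, key=proba.get)
            print(best_label, proba[best_label])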
Example #5
    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    @property
    def md5(self):
        return CHECKSUM

    def download(self):
        """
        Download quora duplicate questions dataset.
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        comparison_download(
            url="http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv",
            text_column1="question1",
            text_column2="question2",
            target_column="is_duplicate",
            filename=QUORA_SIMILARITY
        )

if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate questions dataset
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(verbose=True, n_epochs=3)
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(dataset.Text1, dataset.Text2, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX1, trainX2, trainY)
    accuracy = np.mean(model.predict(testX1, testX2) == testY)
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(accuracy, class_balance))
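As in the first test above, the accuracy is easier to interpret next to the majority-class baseline. A small follow-on in the same style, reusing class_balance from the line above (nothing new is computed from the data):

    naive_baseline = max(class_balance, 1 - class_balance)
    print('Majority-class baseline: {:0.2f}'.format(naive_baseline))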
Example #6
        return CHECKSUM

    def download(self):
        """
        Download quora duplicate questions dataset.
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        comparison_download(url="https://s3.amazonaws.com/enso-data/Quora.csv",
                            text_column1="Text1",
                            text_column2="Text2",
                            target_column="Target",
                            filename=QUORA_SIMILARITY)


if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate questions dataset
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(n_epochs=1)
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(
        dataset.Text1.values,
        dataset.Text2.values,
        dataset.Target.values,
        test_size=0.3,
        random_state=42)
    model.fit(list(zip(trainX1, trainX2)), trainY)
    accuracy = np.mean(model.predict(list(zip(testX1, testX2))) == testY)
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(
        accuracy, class_balance))
Example #7
    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    @property
    def md5(self):
        return CHECKSUM

    def download(self):
        """
        Download quora duplicate questions dataset.
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        comparison_download(
            url="http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv",
            text_column1="question1",
            text_column2="question2",
            target_column="is_duplicate",
            filename=QUORA_SIMILARITY
        )

if __name__ == "__main__":
    # Train and evaluate on the Quora duplicate questions dataset
    dataset = QuoraDuplicate(nrows=5000).dataframe
    model = Comparison(verbose=True, n_epochs=1)
    trainX1, testX1, trainX2, testX2, trainY, testY = train_test_split(dataset.Text1, dataset.Text2, dataset.Target, test_size=0.3, random_state=42)
    model.fit(list(zip(trainX1, trainX2)), trainY)
    accuracy = np.mean(model.predict(list(zip(testX1, testX2))) == testY)
    class_balance = np.mean(testY)
    print('Test Accuracy: {:0.2f} for a {:0.2f} class balance'.format(accuracy, class_balance))
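Finally, a trained model would normally be persisted for later use. Assuming finetune's usual save/load interface (not shown in any of these snippets, so treat the calls and the path as illustrative), this could be appended to the script above:

    model.save("quora_comparison.model")                 # illustrative path
    restored = Comparison.load("quora_comparison.model")
    print(restored.predict(list(zip(testX1[:5], testX2[:5]))))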