Beispiel #1
0
    def run(self, params, batcher):
        """Embed the SICK sentence pairs and evaluate a relatedness model.

        Every split in ``self.sick_data`` is reordered in place (by the
        lengths of both sentences, then by score), embedded in slices of
        ``params.batch_size`` through ``batcher`` and turned into pair
        features ``[|a - b|, a * b]``.  A ``RelatednessPytorch`` instance is
        built from the three splits and run, and the test predictions it
        returns are scored against the gold test scores.

        Args:
            params: Evaluation parameters. ``batch_size`` is read here and
                the object is forwarded unchanged to ``batcher``.
            batcher: Callable ``batcher(params, batch)`` returning the
                embeddings for one slice of sentences; the per-slice results
                are combined with ``np.vstack``.

        Returns:
            dict with the keys ``devpearson``, ``pearson``, ``spearman``,
            ``mse``, ``yhat`` (test predictions as a plain list), ``ndev``
            and ``ntest``.  Undefined (NaN) correlations are reported as 0.
        """
        sick_embed = {"train": {}, "dev": {}, "test": {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info("Computing embedding for {0}".format(key))
            # Sort to reduce padding: neighbouring examples end up with
            # similar lengths, so each batch holds similarly sized inputs.
            sorted_corpus = sorted(
                zip(
                    self.sick_data[key]["X_A"],
                    self.sick_data[key]["X_B"],
                    self.sick_data[key]["y"],
                ),
                key=lambda z: (len(z[0]), len(z[1]), z[2]),
            )

            # The sorted order is written back so that sentences, labels and
            # the embeddings computed below stay aligned.
            self.sick_data[key]["X_A"] = [x for (x, y, z) in sorted_corpus]
            self.sick_data[key]["X_B"] = [y for (x, y, z) in sorted_corpus]
            self.sick_data[key]["y"] = [z for (x, y, z) in sorted_corpus]

            for txt_type in ["X_A", "X_B"]:
                sick_embed[key][txt_type] = []
                for ii in range(0, len(self.sick_data[key]["y"]), bsize):
                    batch = self.sick_data[key][txt_type][ii:ii + bsize]
                    embeddings = batcher(params, batch)
                    sick_embed[key][txt_type].append(embeddings)
                sick_embed[key][txt_type] = np.vstack(
                    sick_embed[key][txt_type])
            sick_embed[key]["y"] = np.array(self.sick_data[key]["y"])
            logging.info("Computed {0} embeddings".format(key))

        # Train
        trainA = sick_embed["train"]["X_A"]
        trainB = sick_embed["train"]["X_B"]
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = self.encode_labels(self.sick_data["train"]["y"])

        # Dev
        devA = sick_embed["dev"]["X_A"]
        devB = sick_embed["dev"]["X_B"]
        devF = np.c_[np.abs(devA - devB), devA * devB]
        devY = self.encode_labels(self.sick_data["dev"]["y"])

        # Test
        testA = sick_embed["test"]["X_A"]
        testB = sick_embed["test"]["X_B"]
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = self.encode_labels(self.sick_data["test"]["y"])

        config = {"seed": self.seed, "nclasses": 5}
        clf = RelatednessPytorch(
            train={
                "X": trainF,
                "y": trainY
            },
            valid={
                "X": devF,
                "y": devY
            },
            test={
                "X": testF,
                "y": testY
            },
            devscores=self.sick_data["dev"]["y"],
            config=config,
        )

        devpr, yhat = clf.run()

        pr = pearsonr(yhat, self.sick_data["test"]["y"])[0]
        sr = spearmanr(yhat, self.sick_data["test"]["y"])[0]
        # A correlation is undefined (NaN) when one input is constant.  NaN
        # is the only value that differs from itself; report 0 in that case
        # so the result stays a usable number (and valid JSON).
        pr = 0 if pr != pr else pr
        sr = 0 if sr != sr else sr
        se = mean_squared_error(yhat, self.sick_data["test"]["y"])
        logging.debug("Dev : Pearson {0}".format(devpr))
        logging.debug("Test : Pearson {0} Spearman {1} MSE {2} \
                       for SICK Relatedness\n".format(pr, sr, se))

        # cast to list so it's json serializable
        yhat = yhat.tolist()
        return {
            "devpearson": devpr,
            "pearson": pr,
            "spearman": sr,
            "mse": se,
            "yhat": yhat,
            "ndev": len(devA),
            "ntest": len(testA),
        }
Beispiel #2
0
    def run(self, params, batcher):
        """Run the SICK relatedness evaluation for one sentence encoder.

        For each split in ``self.sick_data`` the sentence pairs are reordered
        in place by the lengths of both sentences and then by score, and both
        sides are embedded through ``batcher`` in slices of
        ``params.batch_size``.  The embeddings are combined into
        ``[|a - b|, a * b]`` features, handed to ``RelatednessPytorch`` and
        the predictions returned by its ``run()`` are scored on the test set.

        Args:
            params: Evaluation parameters; only ``batch_size`` is read here,
                the object itself is passed on to ``batcher``.
            batcher: Callable ``batcher(params, batch)`` producing the
                embeddings of one slice of sentences.

        Returns:
            dict with ``devpearson``, ``pearson``, ``spearman``, ``mse``,
            ``yhat``, ``ndev`` and ``ntest``.
        """
        batch_size = params.batch_size
        embedded = {'train': {}, 'dev': {}, 'test': {}}

        for split_name in self.sick_data:
            logging.info('Computing embedding for {0}'.format(split_name))
            split = self.sick_data[split_name]

            # Group examples of similar length together to reduce padding.
            triples = list(zip(split['X_A'], split['X_B'], split['y']))
            triples.sort(key=lambda t: (len(t[0]), len(t[1]), t[2]))
            split['X_A'] = [t[0] for t in triples]
            split['X_B'] = [t[1] for t in triples]
            split['y'] = [t[2] for t in triples]

            split_embed = embedded[split_name]
            starts = range(0, len(split['y']), batch_size)
            for field in ('X_A', 'X_B'):
                sentences = split[field]
                chunks = [
                    batcher(params, sentences[start:start + batch_size])
                    for start in starts
                ]
                split_embed[field] = np.vstack(chunks)
            split_embed['y'] = np.array(split['y'])
            logging.info('Computed {0} embeddings'.format(split_name))

        # Pair features and encoded labels, built for train, dev, test in turn.
        features = {}
        targets = {}
        for name in ('train', 'dev', 'test'):
            left = embedded[name]['X_A']
            right = embedded[name]['X_B']
            features[name] = np.c_[np.abs(left - right), left * right]
            targets[name] = self.encode_labels(self.sick_data[name]['y'])

        config = {'seed': self.seed, 'nclasses': 5}
        model = RelatednessPytorch(
            train={'X': features['train'], 'y': targets['train']},
            valid={'X': features['dev'], 'y': targets['dev']},
            test={'X': features['test'], 'y': targets['test']},
            devscores=self.sick_data['dev']['y'],
            config=config,
        )
        dev_pearson, predictions = model.run()

        gold_test = self.sick_data['test']['y']
        pearson = pearsonr(predictions, gold_test)[0]
        spearman = spearmanr(predictions, gold_test)[0]
        # NaN is the only value unequal to itself; such scores become 0.
        if pearson != pearson:
            pearson = 0
        if spearman != spearman:
            spearman = 0
        mse = mean_squared_error(predictions, gold_test)

        logging.debug('Dev : Pearson {0}'.format(dev_pearson))
        logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
                       for SICK Relatedness\n'.format(pearson, spearman, mse))

        results = {
            'devpearson': dev_pearson,
            'pearson': pearson,
            'spearman': spearman,
            'mse': mse,
            'yhat': predictions,
            'ndev': len(embedded['dev']['X_A']),
            'ntest': len(embedded['test']['X_A']),
        }
        return results
Beispiel #3
0
    def run(self, params, batcher):
        """Score SICK relatedness on clean pairs and on adversarial X_A variants.

        Stages, in execution order:

        1. Embed ``X_A`` and ``X_B`` of every split with ``batcher`` (the
           examples keep their original order; the length sort is disabled).
        2. Build ``[|a - b|, a * b]`` pair features and score the output of
           ``RelatednessPytorch.predict`` on test, train and dev.  Only
           ``predict`` is called on the classifier, never ``run``.
           NOTE(review): presumably the classifier restores trained weights
           via ``model_name`` / ``task_name`` from its config - confirm.
        3. Ask ``params.adversarialFunc`` for adversarial variants of each
           ``X_A`` batch and score ``clf.predict`` on those pairs as well.
        4. Call ``self.train_y_pred_model`` with the adversarial train/dev
           data, then ``self.predict_proba`` on adversarial dev/test features.
        5. For every test sentence keep the adversarial output that lies
           furthest from the gold score and compute the final metrics on it.

        Side effects: stores ``params.adversarialFunc`` and ``params`` on the
        instance, replaces the lists in ``self.sick_data`` and prints progress
        information to stdout.

        Args:
            params: Evaluation parameters.  ``batch_size``,
                ``adversarialFunc`` and ``model_name`` are read here; the
                object is also passed to ``batcher`` and ``adversarialFunc``.
            batcher: Callable ``batcher(params, batch)`` returning the
                embeddings of one slice of sentences.

        Returns:
            dict with ``devpearson`` (always -1), ``pearson``, ``spearman``,
            ``mse`` (all three from stage 5), ``yhat`` (the raw
            ``predict_proba`` output for the adversarial test features),
            ``ndev`` and ``ntest``.
        """
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size
        self.adversarialFunc = params.adversarialFunc
        self.params = params
        sick_advs = {'train': {}, 'dev': {}, 'test': {}}

        def pair_features(left, right):
            """Return ``[|left - right|, left * right]`` joined column-wise."""
            return np.c_[np.abs(left - right), left * right]

        def report(tag, preds, gold):
            """Print the shape of ``preds`` and log its scores against ``gold``.

            Returns ``(pearson, spearman, mse)``; NaN correlations become 0.
            """
            print("%s yhat shape:" % tag)
            print(preds.shape)
            pr = pearsonr(preds, gold)[0]
            sr = spearmanr(preds, gold)[0]
            # NaN is the only value unequal to itself.
            pr = 0 if pr != pr else pr
            sr = 0 if sr != sr else sr
            se = mean_squared_error(preds, gold)
            # The label names the data that was actually scored.
            logging.debug(
                '{0} : Pearson {1} Spearman {2} MSE {3} '
                'for SICK Relatedness\n'.format(tag.capitalize(), pr, sr, se))
            return pr, sr, se

        def flatten_adversaries(split):
            """Concatenate the per-sentence adversary groups of ``split``.

            Returns a dict with the arrays ``advs_x``, ``x_b``, ``orig_x``
            and ``predicted_y``, the pair ``features`` of adversary vs. X_B,
            the ``gold`` scores as a list and ``sent_ids``, which maps every
            adversary to the index of the sentence it was generated from.
            """
            groups = sick_advs[split]
            advs_x, x_b, gold, orig_x, predicted_y = [], [], [], [], []
            sent_ids = []
            rows = zip(groups['X_A'], groups['X_B'], groups['y'],
                       groups['X_A_orig'], groups['predicted_y'])
            for sent_id, row in enumerate(rows):
                a_advs, b_advs, y_advs, orig_advs, pred_advs = row
                advs_x.extend(a_advs)
                x_b.extend(b_advs)
                gold.extend(y_advs)
                orig_x.extend(orig_advs)
                predicted_y.extend(pred_advs)
                sent_ids.extend([sent_id] * len(a_advs))

            advs_x = np.array(advs_x)
            x_b = np.array(x_b)
            gold = np.array(gold)
            orig_x = np.array(orig_x)
            predicted_y = np.array(predicted_y)

            print("%s adversaries length:%d,%d,%d "
                  % (split, len(advs_x), len(x_b), len(gold)))
            return {
                'advs_x': advs_x,
                'x_b': x_b,
                'orig_x': orig_x,
                'predicted_y': predicted_y,
                'features': pair_features(advs_x, x_b),
                'gold': list(gold),
                'sent_ids': sent_ids,
            }

        def second_stage_features(data):
            """Features comparing adversary and original X_A, plus ``y_hat``."""
            return np.c_[np.abs(data['advs_x'] - data['orig_x']),
                         data['advs_x'] * data['orig_x'],
                         data['y_hat']]

        # ---- Stage 1: embed every split ---------------------------------
        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            data = self.sick_data[key]
            # No length sort in this variant.  The zip is materialised
            # because it is traversed three times; a bare zip iterator would
            # be exhausted after the first pass on Python 3, leaving X_B and
            # y empty.
            corpus = list(zip(data['X_A'], data['X_B'], data['y']))
            data['X_A'] = [a for (a, _, _) in corpus]
            data['X_B'] = [b for (_, b, _) in corpus]
            data['y'] = [score for (_, _, score) in corpus]

            for txt_type in ['X_A', 'X_B']:
                sick_embed[key][txt_type] = []
                for ii in range(0, len(data['y']), bsize):
                    batch = data[txt_type][ii:ii + bsize]
                    sick_embed[key][txt_type].append(batcher(params, batch))
                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
            sick_embed[key]['y'] = np.array(data['y'])
            logging.info('Computed {0} embeddings'.format(key))

        # ---- Stage 2: clean pair features and predictions ---------------
        trainA = sick_embed['train']['X_A']
        trainB = sick_embed['train']['X_B']
        trainF = pair_features(trainA, trainB)
        trainY = self.encode_labels(self.sick_data['train']['y'])

        devA = sick_embed['dev']['X_A']
        devB = sick_embed['dev']['X_B']
        devF = pair_features(devA, devB)
        devY = self.encode_labels(self.sick_data['dev']['y'])

        testA = sick_embed['test']['X_A']
        testB = sick_embed['test']['X_B']
        testF = pair_features(testA, testB)
        testY = self.encode_labels(self.sick_data['test']['y'])

        config = {'seed': self.seed, 'nclasses': 5,
                  'model_name': params.model_name, 'task_name': self.task_name}
        clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
                                 valid={'X': devF, 'y': devY},
                                 test={'X': testF, 'y': testY},
                                 devscores=self.sick_data['dev']['y'],
                                 config=config)

        test_yhat = clf.predict(testF)
        report('test', test_yhat, self.sick_data['test']['y'])

        train_yhat = clf.predict(trainF)
        report('train', train_yhat, self.sick_data['train']['y'])

        dev_yhat = clf.predict(devF)
        report('dev', dev_yhat, self.sick_data['dev']['y'])

        y_hat = {'train': train_yhat, 'dev': dev_yhat, 'test': test_yhat}

        # ---- Stage 3: adversarial variants of X_A -----------------------
        for key in self.sick_data:
            data = self.sick_data[key]
            advs = sick_advs[key]
            for field in ('X_A', 'X_A_orig', 'X_B', 'y', 'sents', 'predicted_y'):
                advs[field] = []

            n_examples = len(data['X_A'])
            for ii in range(0, n_examples, bsize):
                batch = data['X_A'][ii:ii + bsize]
                labels = data['y'][ii:ii + bsize]
                embeddings = sick_embed[key]['X_A'][ii:ii + bsize]
                adv_samples, _, new_sentences = self.adversarialFunc(
                    params, batch, labels, embeddings)
                print("Computing %dth embedding: batch_size %d" % (ii, len(batch)))
                # Every batch except the last must yield one group of
                # adversaries per input sentence.
                if ii + bsize < n_examples:
                    assert len(adv_samples) == bsize

                for j, sent_adversaries in enumerate(adv_samples):
                    idx = ii + j
                    # Repeat the clean counterparts once per adversary so all
                    # lists of one sentence have the same length.
                    advs['X_A'].append(sent_adversaries)
                    advs['X_A_orig'].append(
                        [sick_embed[key]['X_A'][idx] for _ in sent_adversaries])
                    advs['X_B'].append(
                        [sick_embed[key]['X_B'][idx] for _ in sent_adversaries])
                    advs['y'].append(
                        [data['y'][idx] for _ in sent_adversaries])
                    advs['sents'].append(new_sentences[j])
                    advs['predicted_y'].append(
                        [y_hat[key][idx] for _ in sent_adversaries])

            print("no of examples for key:%s:%d,%d"
                  % (key, len(advs['X_A']), len(advs['X_B'])))

        adv_train = flatten_adversaries('train')
        adv_test = flatten_adversaries('test')
        adv_dev = flatten_adversaries('dev')

        advs_train_yhat = clf.predict(adv_train['features'])
        report('advs train', advs_train_yhat, adv_train['gold'])

        advs_test_yhat = clf.predict(adv_test['features'])
        report('advs test', advs_test_yhat, adv_test['gold'])

        advs_dev_yhat = clf.predict(adv_dev['features'])
        report('advs dev', advs_dev_yhat, adv_dev['gold'])

        # ---- Stage 4: second-stage model on adversarial data ------------
        train_x = {'advs_x': adv_train['advs_x'], 'orig_x': adv_train['orig_x'],
                   'y_hat': advs_train_yhat,
                   'predicted_y': adv_train['predicted_y']}
        dev_x = {'advs_x': adv_dev['advs_x'], 'orig_x': adv_dev['orig_x'],
                 'y_hat': advs_dev_yhat,
                 'predicted_y': adv_dev['predicted_y']}
        test_x = {'advs_x': adv_test['advs_x'], 'orig_x': adv_test['orig_x'],
                  'y_hat': advs_test_yhat,
                  'predicted_y': adv_test['predicted_y']}

        # Built before train_y_pred_model is called, as in the original flow.
        dev_f = second_stage_features(dev_x)
        test_f = second_stage_features(test_x)

        # Targets are the classifier's predictions on the clean pairs.
        train_y = adv_train['predicted_y']
        dev_y = adv_dev['predicted_y']
        test_y = adv_test['predicted_y']

        self.train_y_pred_model(train_x, dev_x, train_y, dev_y)

        dev_preds = self.predict_proba(dev_f)
        dev_se = mean_squared_error(dev_preds, dev_y)
        print("dev squared error: ", dev_se)

        test_preds = self.predict_proba(test_f)
        print(len(test_f), len(test_preds))
        test_se = mean_squared_error(test_preds, test_y)
        print("test squared error: ", test_se)

        # ---- Stage 5: worst-case output per test sentence ---------------
        gold_test = self.sick_data['test']['y']
        adv_gold = adv_test['gold']
        # Starts from the gold scores (error 0), so any adversarial output
        # with a non-zero error replaces the entry; sentences without
        # adversaries keep their gold score.
        new_preds = list(gold_test)

        assert len(test_preds) == len(adv_gold)
        print(len(sick_advs['test']['y']), len(adv_gold))
        for pred, gold, sent_no in zip(test_preds, adv_gold, adv_test['sent_ids']):
            if abs(pred - gold) > abs(new_preds[sent_no] - gold):
                new_preds[sent_no] = pred

        new_preds = np.array(new_preds).reshape((len(new_preds)))
        pr, sr, se = report('final test', new_preds, gold_test)

        # clf.run() is never called here, so no dev Pearson exists; -1 is
        # returned as a placeholder.
        devpr = -1

        # NOTE(review): 'ntest' reports the number of *train* adversaries,
        # which looks like a copy/paste slip (len(testA) or the number of
        # test adversaries seems more plausible).  Kept as-is because the
        # intended value cannot be determined from this method alone.
        return {'devpearson': devpr, 'pearson': pr, 'spearman': sr, 'mse': se,
                'yhat': test_preds, 'ndev': len(devA),
                'ntest': len(adv_train['advs_x'])}