Esempio n. 1
0
    def run(self, params, batcher):
        """Embed every split in ``self.data`` and score it with KFoldClassifier.

        For each split the sentences and labels are sorted together by
        (sentence length, label); the sorted lists are written back onto
        ``self.data`` before the sentences are handed to ``batcher`` in
        slices of ``params.batch_size``.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``; the
                per-batch results are stacked with ``np.vstack``.

        Returns:
            dict with ``devacc``, ``acc`` and ``ntest`` (number of rows in
            the stacked test embeddings).
        """
        step = params.batch_size
        embed = {'train': {}, 'test': {}}

        for split in self.data:
            logging.info('Computing embedding for {0}'.format(split))
            split_data = self.data[split]

            # Sort to reduce padding; the sorted order is stored back on
            # self.data, exactly as callers of this method already observe.
            paired = list(zip(split_data['X'], split_data['y']))
            paired.sort(key=lambda pair: (len(pair[0]), pair[1]))
            samples, labels = (list(column) for column in zip(*paired))
            split_data['X'] = samples
            split_data['y'] = labels

            split_embed = embed[split]
            chunks = [batcher(params, samples[start:start + step])
                      for start in range(0, len(labels), step)]
            split_embed['X'] = np.vstack(chunks)
            split_embed['y'] = np.array(labels)
            logging.info('Computed {0} embeddings'.format(split))

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'kfold': params.kfold,
        }
        clf = KFoldClassifier(embed['train'], embed['test'], config_classifier)

        # The third value returned by clf.run() is not used here.
        devacc, testacc, _ = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for \
            FormalityJa classification\n'.format(devacc, testacc))

        ntest = len(embed['test']['X'])
        return {'devacc': devacc, 'acc': testacc, 'ntest': ntest}
Esempio n. 2
0
    def run(self, params, batcher):
        """Embed the train/test corpora and evaluate a 6-class k-fold classifier.

        Both corpora are sorted by (sentence length, label) before being
        embedded in slices of ``params.batch_size``.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``.

        Returns:
            dict with ``devacc``, ``acc``, ``ndev``, ``ntest``, the
            ``classifier`` object returned by ``clf.run()``, the stacked
            ``X_train`` / ``X_test`` embeddings and ``text`` (the sorted
            training sentences).
        """
        def length_then_label(pair):
            return (len(pair[0]), pair[1])

        def embed_all(samples):
            # One batcher call per slice of batch_size sentences.
            step = params.batch_size
            pieces = [batcher(params, samples[start:start + step])
                      for start in range(0, len(samples), step)]
            return np.vstack(pieces)

        # Sort to reduce padding
        ordered_train = sorted(zip(self.train['X'], self.train['y']),
                               key=length_then_label)
        train_samples = [sample for sample, _ in ordered_train]
        train_labels = [label for _, label in ordered_train]

        ordered_test = sorted(zip(self.test['X'], self.test['y']),
                              key=length_then_label)
        test_samples = [sample for sample, _ in ordered_test]
        test_labels = [label for _, label in ordered_test]

        train_embeddings = embed_all(train_samples)
        logging.info('Computed train embeddings')

        test_embeddings = embed_all(test_samples)
        logging.info('Computed test embeddings')

        config_classifier = {
            'nclasses': 6,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'kfold': params.kfold
        }
        train_set = {'X': train_embeddings, 'y': np.array(train_labels)}
        test_set = {'X': test_embeddings, 'y': np.array(test_labels)}
        clf = KFoldClassifier(train_set, test_set, config_classifier)

        # This variant of clf.run() yields the classifier as first element.
        classifier, devacc, testacc, _ = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} \
            for TREC\n'.format(devacc, testacc))

        results = {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.train['X']),
            'ntest': len(self.test['X']),
            'classifier': classifier,
            'X_train': train_embeddings,
            'X_test': test_embeddings,
            'text': train_samples
        }
        return results
Esempio n. 3
0
    def single_run(self, params, batcher, train_X, train_y, test_X, test_y,
                   field):
        """Run one 3-class k-fold evaluation and report it under ``field``.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier``, ``nhid`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``.
            train_X, train_y: training sentences and their labels.
            test_X, test_y: test sentences and their labels.
            field: string used as prefix in the log line and result keys.

        Returns:
            dict keyed ``'<field> devacc'``, ``'<field> acc'``,
            ``'<field> ndev'`` and ``'<field> ntest'``.
        """
        def sort_corpus(samples, labels):
            # Sort to reduce padding: by sentence length, then label.
            ordered = sorted(zip(samples, labels),
                             key=lambda pair: (len(pair[0]), pair[1]))
            return ([sample for sample, _ in ordered],
                    [label for _, label in ordered])

        def embed_all(samples):
            step = params.batch_size
            pieces = [batcher(params, samples[start:start + step])
                      for start in range(0, len(samples), step)]
            return np.vstack(pieces)

        train_samples, train_labels = sort_corpus(train_X, train_y)
        test_samples, test_labels = sort_corpus(test_X, test_y)

        train_embeddings = embed_all(train_samples)
        logging.info('Computed train embeddings')

        test_embeddings = embed_all(test_samples)
        logging.info('Computed test embeddings')

        config_classifier = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'kfold': params.kfold
        }
        train_set = {'X': train_embeddings, 'y': np.array(train_labels)}
        test_set = {'X': test_embeddings, 'y': np.array(test_labels)}
        clf = KFoldClassifier(train_set, test_set, config_classifier)

        devacc, testacc, _ = clf.run()
        logging.debug('\n' + field + ' Dev acc : {0} Test acc : {1} \
            for ABSA_SP\n'.format(devacc, testacc))

        report = {}
        report['{} devacc'.format(field)] = devacc
        report['{} acc'.format(field)] = testacc
        report['{} ndev'.format(field)] = len(train_X)
        report['{} ntest'.format(field)] = len(test_X)
        return report
Esempio n. 4
0
    def run(self, params, batcher):
        """Embed the train/test corpora and evaluate a 27-class k-fold classifier.

        Sentences are embedded in their stored order (no length sorting) in
        slices of ``params.batch_size``.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``; the
                per-batch results are stacked with ``np.vstack``.

        Returns:
            dict with ``devacc``, ``acc``, ``ndev`` (number of training
            sentences) and ``ntest`` (number of test sentences).
        """
        step = params.batch_size

        train_samples = self.train['X']
        train_labels = self.train['y']

        test_samples = self.test['X']
        test_labels = self.test['y']

        # Get train embeddings
        train_embeddings = np.vstack(
            [batcher(params, train_samples[ii:ii + step])
             for ii in range(0, len(train_labels), step)])
        logging.info('Computed train embeddings')

        # Get test embeddings
        test_embeddings = np.vstack(
            [batcher(params, test_samples[ii:ii + step])
             for ii in range(0, len(test_labels), step)])
        logging.info('Computed test embeddings')

        config_classifier = {
            'nclasses': 27,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'kfold': params.kfold
        }
        clf = KFoldClassifier(
            {
                'X': train_embeddings,
                'y': np.array(train_labels)
            }, {
                'X': test_embeddings,
                'y': np.array(test_labels)
            }, config_classifier)
        devacc, testacc, _ = clf.run()

        # Format the complete message in one call. Previously .format() was
        # attached only to the trailing " " literal, so the {0}/{1}
        # placeholders were logged verbatim and the accuracies were lost.
        logging.debug('\nDev acc : {0} Test acc : {1} for {2} '.format(
            devacc, testacc, self.evalType))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.train['X']),
            'ntest': len(self.test['X'])
        }
Esempio n. 5
0
    def run(self, params, batcher):
        """Embed MRPC sentence pairs and evaluate a binary k-fold classifier.

        Each split is sorted by (len(A), len(B), label). Unlike a per-batch
        batcher, ``batcher`` here receives the whole sorted list of one
        sentence column at once together with the split name.

        Args:
            params: run configuration; ``usepytorch``, ``classifier``,
                ``nhid`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, sentences, key)``.

        Returns:
            dict with ``devacc``, ``acc``, ``f1`` (test F1 in percent,
            rounded to two decimals), ``ndev`` and ``ntest``.
        """
        def pair_features(left, right):
            # |u - v| and u * v placed side by side column-wise.
            return np.c_[np.abs(left - right), left * right]

        mrpc_embed = {'train': {}, 'test': {}}

        for key in self.mrpc_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.mrpc_data[key]

            # Sort to reduce padding
            triples = sorted(
                zip(split['X_A'], split['X_B'], split['y']),
                key=lambda t: (len(t[0]), len(t[1]), t[2]))
            first_sentences = [a for a, _, _ in triples]
            second_sentences = [b for _, b, _ in triples]
            labels = [label for _, _, label in triples]

            for txt_type, sentences in (('A', first_sentences),
                                        ('B', second_sentences)):
                mrpc_embed[key][txt_type] = batcher(params, sentences, key)
            mrpc_embed[key]['y'] = np.array(labels)
            logging.info('Computed {0} embeddings'.format(key))

        # Train
        trainA = mrpc_embed['train']['A']
        trainB = mrpc_embed['train']['B']
        trainF = pair_features(trainA, trainB)
        trainY = mrpc_embed['train']['y']

        # Test
        testA = mrpc_embed['test']['A']
        testB = mrpc_embed['test']['B']
        testF = pair_features(testA, testB)
        testY = mrpc_embed['test']['y']

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'kfold': params.kfold,
        }
        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                              test={'X': testF, 'y': testY},
                              config=config)

        devacc, testacc, yhat = clf.run()
        testf1 = round(100*f1_score(testY, yhat), 2)
        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
                      .format(devacc, testacc, testf1))

        summary = {'devacc': devacc, 'acc': testacc, 'f1': testf1}
        summary['ndev'] = len(trainA)
        summary['ntest'] = len(testA)
        return summary
Esempio n. 6
0
    def run(self, params, batcher):
        """Evaluate MRPC from pre-computed, pickled sentence embeddings.

        The embeddings are not computed here: the A/B sentence embeddings
        and labels of the train and test splits are unpickled from files
        under ``embeddings/`` whose names contain ``params.model_name``.

        Side effects: stores ``params`` on ``self.params``, copies
        ``params.adversarialFunc`` to ``self.adversarialFunc`` and sets
        ``params.task_name`` to ``"mrpc"``.

        Args:
            params: run configuration; ``model_name``, ``adversarialFunc``,
                ``usepytorch``, ``classifier``, ``nhid`` and ``kfold`` are
                read here, and the object is passed on to ``clf.run``.
            batcher: forwarded unchanged to the classifier config under the
                ``'batcher'`` key.

        Returns:
            dict with ``devacc``, ``acc``, ``f1`` (test F1 in percent,
            rounded to two decimals), ``ndev`` and ``ntest``.
        """
        def load_pickle(path):
            # The context manager closes the handle even when unpickling
            # raises; the bare open() used before leaked one handle per file.
            # NOTE(review): pickle.load executes arbitrary code from the
            # file, so these embedding files must come from a trusted source.
            with open(path, 'rb') as handle:
                return pickle.load(handle)

        model = params.model_name
        test_file_x_a = 'embeddings/testx_a_' + model + "_mrpc.csv"
        test_file_x_b = 'embeddings/testx_b_' + model + "_mrpc.csv"
        test_file_y = 'embeddings/testy_' + model + "_mrpc.csv"

        # NOTE(review): the two train X paths have no '_' before the model
        # name, unlike every other path. Kept as-is because the names
        # presumably have to match files already written to disk -- confirm.
        train_file_x_a = 'embeddings/trainx_a' + model + "_mrpc.csv"
        train_file_x_b = 'embeddings/trainx_b' + model + "_mrpc.csv"
        train_file_y = 'embeddings/trainy_' + model + "_mrpc.csv"

        self.params = params
        self.adversarialFunc = params.adversarialFunc

        logging.info("reading files")
        # Only the train and test splits are ever loaded or read.
        testA = load_pickle(test_file_x_a)
        testB = load_pickle(test_file_x_b)
        testY = load_pickle(test_file_y)

        trainA = load_pickle(train_file_x_a)
        trainB = load_pickle(train_file_x_b)
        trainY = load_pickle(train_file_y)

        # Pair features: |u - v| and u * v side by side.
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        testF = np.c_[np.abs(testA - testB), testA * testB]

        print("trainf vector shape", trainF.shape)

        # The adversarial generator is only wired in when an adversarial
        # function was configured on params.
        if self.adversarialFunc is not None:
            adversarial_sample_generator = self.generate_adv_samples
        else:
            adversarial_sample_generator = None

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'kfold': params.kfold,
            'adversarial_sample_generator': adversarial_sample_generator,
            'batcher': batcher
        }

        params.task_name = "mrpc"
        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                              test={'X': testF, 'y': testY},
                              config=config)

        devacc, testacc, yhat = clf.run(params)
        testf1 = round(100 * f1_score(testY, yhat), 2)
        logging.debug(
            'Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'.format(
                devacc, testacc, testf1))
        return {
            'devacc': devacc,
            'acc': testacc,
            'f1': testf1,
            'ndev': len(trainA),
            'ntest': len(testA)
        }
Esempio n. 7
0
    def run(self, params, batcher):
        """Embed question/snippet pairs and evaluate a binary k-fold classifier.

        Each split is sorted by (len(question), len(snippet), label) and
        embedded in slices of ``params.batch_size``. The classifier settings
        are a deep copy of ``params.classifier`` with ``max_epoch``,
        ``epoch_size`` and ``batch_size`` overridden, so ``params`` itself is
        not modified.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier``, ``nhid`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``.

        Returns:
            dict with ``devacc``, ``acc``, ``f1`` (test F1 in percent,
            rounded to two decimals), ``ndev`` and ``ntest``.
        """
        def pair_features(question, snippet):
            # [q, s, q * s, |q - s|] concatenated horizontally.
            return np.hstack((question, snippet, question * snippet,
                              np.abs(question - snippet)))

        step = params.batch_size
        qa_embed = {'train': {}, 'test': {}}

        for key in self.qa_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.qa_data[key]

            # Sort to reduce padding
            triples = sorted(
                zip(split['question'], split['snippet'], split['label']),
                key=lambda t: (len(t[0]), len(t[1]), t[2]))
            texts = {
                'question': [q for q, _, _ in triples],
                'snippet': [s for _, s, _ in triples],
            }
            labels = [label for _, _, label in triples]

            split_embed = qa_embed[key]
            for txt_type in ('question', 'snippet'):
                sentences = texts[txt_type]
                pieces = [batcher(params, sentences[start:start + step])
                          for start in range(0, len(labels), step)]
                split_embed[txt_type] = np.vstack(pieces)
            split_embed['label'] = np.array(labels)
            logging.info('Computed {0} embeddings'.format(key))

        # Train
        trainQ = qa_embed['train']['question']
        trainS = qa_embed['train']['snippet']
        trainQS = pair_features(trainQ, trainS)
        trainY = qa_embed['train']['label']

        # Test
        testQ = qa_embed['test']['question']
        testS = qa_embed['test']['snippet']
        testQS = pair_features(testQ, testS)
        testY = qa_embed['test']['label']

        # Override the training schedule on a private copy; the assignment
        # order is kept because it determines the printed key order.
        config_classifier = copy.deepcopy(params.classifier)
        for option, value in (('max_epoch', 1),
                              ('epoch_size', 64),
                              ('batch_size', 64)):
            config_classifier[option] = value

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': config_classifier,
            'nhid': params.nhid,
            'kfold': params.kfold
        }
        print(config_classifier)

        clf = KFoldClassifier(train={'X': trainQS, 'y': trainY},
                              test={'X': testQS, 'y': testY},
                              config=config)

        devacc, testacc, yhat = clf.run()
        testf1 = round(100 * f1_score(testY, yhat), 2)
        logging.debug(
            'Dev acc : {0} Test acc {1}; Test F1 {2} for BioASQ 5b task (yes/no questions).\n'
            .format(devacc, testacc, testf1))

        summary = {'devacc': devacc, 'acc': testacc, 'f1': testf1}
        summary['ndev'] = len(trainQS)
        summary['ntest'] = len(testQS)
        return summary
Esempio n. 8
0
    def run(self, params, batcher):
        """Embed chq/faq question pairs and evaluate a binary k-fold classifier.

        Each split is sorted by (len(chq), len(faq), label) and embedded in
        slices of ``params.batch_size``. The pair ids of the test split (in
        sorted order) and the test predictions are printed to stdout.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier``, ``nhid`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``.

        Returns:
            dict with ``devacc``, ``acc``, ``f1`` (test F1 in percent,
            rounded to two decimals), ``ndev`` and ``ntest``.
        """
        step = params.batch_size
        rqe_embed = {'train': {}, 'test': {}}
        # Sorted pair ids per split, kept so the ids printed next to the
        # predictions are always those of the test split.
        pids_by_split = {}

        for key in self.rqe_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_corpus = sorted(zip(self.rqe_data[key]['chq'],
                                       self.rqe_data[key]['faq'],
                                       self.rqe_data[key]['label'],
                                       self.rqe_data[key]['pid']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))
            text_data = {
                'chq': [chq for (chq, _, _, _) in sorted_corpus],
                'faq': [faq for (_, faq, _, _) in sorted_corpus],
                'label': [label for (_, _, label, _) in sorted_corpus],
            }
            pids_by_split[key] = [pid for (_, _, _, pid) in sorted_corpus]

            for txt_type in ['chq', 'faq']:
                rqe_embed[key][txt_type] = []
                for ii in range(0, len(text_data['label']), step):
                    batch = text_data[txt_type][ii:ii + step]
                    rqe_embed[key][txt_type].append(batcher(params, batch))
                rqe_embed[key][txt_type] = np.vstack(rqe_embed[key][txt_type])
            rqe_embed[key]['label'] = np.array(text_data['label'])
            logging.info('Computed {0} embeddings'.format(key))

        # Train: [c, f, c * f, |c - f|] concatenated horizontally
        trainC = rqe_embed['train']['chq']
        trainF = rqe_embed['train']['faq']
        trainCF = np.hstack(
            (trainC, trainF, trainC * trainF, np.abs(trainC - trainF)))
        trainY = rqe_embed['train']['label']

        # Test
        testC = rqe_embed['test']['chq']
        testF = rqe_embed['test']['faq']
        testCF = np.hstack(
            (testC, testF, testC * testF, np.abs(testC - testF)))
        testY = rqe_embed['test']['label']

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'kfold': params.kfold
        }
        clf = KFoldClassifier(train={'X': trainCF, 'y': trainY},
                              test={'X': testCF, 'y': testY},
                              config=config)

        devacc, testacc, yhat = clf.run()

        # Previously this printed the pids of whichever split the loop above
        # happened to visit last; yhat is compared against the test labels
        # below, so the test pids are the ones that line up with it.
        print(pids_by_split['test'])
        pred = list(yhat)
        print(pred)

        testf1 = round(100 * f1_score(testY, yhat), 2)
        logging.debug(
            'Dev acc : {0} Test acc {1}; Test F1 {2} for RQE.\n'.format(
                devacc, testacc, testf1))
        return {
            'devacc': devacc,
            'acc': testacc,
            'f1': testf1,
            'ndev': len(trainCF),
            'ntest': len(testCF)
        }
Esempio n. 9
0
    def run(self, params, batcher):
        """Evaluate a 6-class k-fold classifier and return per-sample predictions.

        Both corpora are sorted by (sentence length, label) before being
        embedded. For the test corpus the original positions are remembered
        so the predictions can be put back in the order of ``self.test``.

        Args:
            params: run configuration; ``batch_size``, ``usepytorch``,
                ``classifier`` and ``kfold`` are read here.
            batcher: callable invoked as ``batcher(params, batch)``.

        Returns:
            dict with ``devacc``, ``acc``, ``ndev``, ``ntest``, ``yhat``
            (predictions re-ordered to the original test order) and
            ``metadata`` (``self.metadata`` passed through).
        """
        def embed_all(samples):
            step = params.batch_size
            pieces = [batcher(params, samples[start:start + step])
                      for start in range(0, len(samples), step)]
            return np.vstack(pieces)

        # Sort to reduce padding
        ordered_train = sorted(zip(self.train['X'], self.train['y']),
                               key=lambda pair: (len(pair[0]), pair[1]))
        train_samples = [sample for sample, _ in ordered_train]
        train_labels = [label for _, label in ordered_train]

        # Each test item is (original position, (sentence, label)).
        indexed_test = sorted(
            enumerate(zip(self.test['X'], self.test['y'])),
            key=lambda item: (len(item[1][0]), item[1][1]))
        sorted_test_indices = [position for position, _ in indexed_test]
        test_samples = [pair[0] for _, pair in indexed_test]
        test_labels = [pair[1] for _, pair in indexed_test]

        train_embeddings = embed_all(train_samples)
        logging.info('Computed train embeddings')

        test_embeddings = embed_all(test_samples)
        logging.info('Computed test embeddings')

        config_classifier = {
            'nclasses': 6,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'kfold': params.kfold
        }
        train_set = {'X': train_embeddings, 'y': np.array(train_labels)}
        test_set = {'X': test_embeddings, 'y': np.array(test_labels)}
        clf = KFoldClassifier(train_set, test_set, config_classifier)
        devacc, testacc, yhat_sorted = clf.run()

        # Undo the length sort: prediction number `rank` belongs to the test
        # sample that originally sat at sorted_test_indices[rank].
        yhat = [None] * len(yhat_sorted)
        for rank, prediction in enumerate(yhat_sorted):
            yhat[sorted_test_indices[rank]] = prediction

        logging.debug('\nDev acc : {0} Test acc : {1} \
            for TREC\n'.format(devacc, testacc))

        results = {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.train['X']),
            'ntest': len(self.test['X']),
            'yhat': yhat,
            'metadata': self.metadata
        }
        return results