Example #1
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)

            enc1 = batcher(params, input1)
            enc2 = batcher(params, input2)
            enc_input = np.hstack((enc1, enc2, np.abs(enc1 - enc2)))
            #            enc_input = np.hstack((enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2)))
            #            for ii in range(0, n_labels, params.batch_size):
            #                batch1 = input1[ii:ii + params.batch_size]
            #                batch2 = input2[ii:ii + params.batch_size]
            #
            #                if len(batch1) == len(batch2) and len(batch1) > 0:
            #                    enc1 = batcher(params, batch1)
            #                    enc2 = batcher(params, batch2)
            #                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
            #                                                np.abs(enc1 - enc2))))
            #                if (ii*params.batch_size) % (20000*params.batch_size) == 0:
            #                    logging.info("PROGRESS (encoding): %.2f%%" %
            #                                 (100 * ii / n_labels))
            #            self.X[key] = np.vstack(enc_input)
            self.X[key] = enc_input
            self.y[key] = [dico_label[y] for y in mylabels]

        config = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': 1000,
            'noreg': True
        }

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config_classifier['nhid'] = 1000
        config['classifier'] = config_classifier
        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
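
Note: every NLI-style snippet in this listing builds the same pair features from two sentence encodings. A minimal, self-contained sketch of that construction, assuming the batcher returns a 2-D numpy array of shape (n_sentences, dim):

    import numpy as np

    def pair_features(enc1, enc2):
        # Concatenate both encodings, their element-wise product,
        # and their absolute difference, as in the examples in this listing.
        return np.hstack((enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2)))

    # Toy check: two pairs of 4-dim embeddings -> a (2, 16) feature matrix.
    u, v = np.random.rand(2, 4), np.random.rand(2, 4)
    assert pair_features(u, v).shape == (2, 16)

Example #1 omits the product term, so its feature matrix is (n, 3 * dim) instead.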
Example #2
    def run(self, params, batcher):
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                       self.sick_data[key]['X_B'],
                                       self.sick_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))

            self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
            self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
            self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]

            for txt_type in ['X_A', 'X_B']:
                params.batcher_dataset = f"{key}_{txt_type}"
                sick_embed[key][txt_type] = []
                for ii in range(0, len(self.sick_data[key]['y']), bsize):
                    params.batcher_offset = str(ii)
                    batch = self.sick_data[key][txt_type][ii:ii + bsize]
                    embeddings = batcher(params, batch)
                    sick_embed[key][txt_type].append(embeddings)
                sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
            logging.info('Computed {0} embeddings'.format(key))

        # Train
        trainA = sick_embed['train']['X_A']
        trainB = sick_embed['train']['X_B']
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = np.array(self.sick_data['train']['y'])

        # Dev
        devA = sick_embed['dev']['X_A']
        devB = sick_embed['dev']['X_B']
        devF = np.c_[np.abs(devA - devB), devA * devB]
        devY = np.array(self.sick_data['dev']['y'])

        # Test
        testA = sick_embed['test']['X_A']
        testB = sick_embed['test']['X_B']
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = np.array(self.sick_data['test']['y'])

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid}
        clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},
                              y={'train': trainY, 'valid': devY, 'test': testY},
                              config=config)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for SICK entailment\n'
                      .format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(devA), 'ntest': len(testA)}
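
Note: SplitClassifier is SentEval's train/valid/test wrapper; the variants in this listing disagree on what clf.run() returns (two accuracies, sometimes extra predictions or the fitted classifier). A hedged usage sketch against stock SentEval, with stand-in random data:

    import numpy as np
    # Location in stock SentEval; the forks in this listing may differ.
    from senteval.tools.validation import SplitClassifier

    rng = np.random.RandomState(1111)
    X = {'train': rng.rand(500, 100), 'valid': rng.rand(100, 100),
         'test': rng.rand(100, 100)}
    y = {k: rng.randint(0, 3, len(v)) for k, v in X.items()}

    config = {'nclasses': 3, 'seed': 1111, 'usepytorch': False,
              'classifier': {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
                             'tenacity': 5, 'epoch_size': 4}}

    clf = SplitClassifier(X, y, config)  # fits on 'train', model-selects on 'valid'
    devacc, testacc = clf.run()          # stock version returns two accuracies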
Example #3
    def run(self, params, batcher):
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            zipped_data = sorted(enumerate(
                zip(self.sst_data[key]['X'], self.sst_data[key]['y'])),
                                 key=lambda z: (len(z[1][0]), z[1][1]))
            if key == 'test':
                sorted_test_indices = [i for (i, z) in zipped_data]
            self.sst_data[key]['X'] = [x for (i, (x, y)) in zipped_data]
            self.sst_data[key]['y'] = [y for (i, (x, y)) in zipped_data]

            sst_embed[key]['X'] = []
            for ii in range(0, len(self.sst_data[key]['y']), bsize):
                batch = self.sst_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sst_embed[key]['X'].append(embeddings)
            sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
            sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
            logging.info('Computed {0} embeddings'.format(key))

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier
        }

        clf = SplitClassifier(X={
            'train': sst_embed['train']['X'],
            'valid': sst_embed['dev']['X'],
            'test': sst_embed['test']['X']
        },
                              y={
                                  'train': sst_embed['train']['y'],
                                  'valid': sst_embed['dev']['y'],
                                  'test': sst_embed['test']['y']
                              },
                              config=config_classifier)

        devacc, testacc, yhat_sorted = clf.run()
        yhat = [None] * len(yhat_sorted)
        for (i, y) in enumerate(yhat_sorted):
            yhat[sorted_test_indices[i]] = y
        logging.debug('\nDev acc : {0} Test acc : {1} for SST {2} '
                      'classification\n'.format(devacc, testacc, self.task_name))

        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X']),
            'yhat': yhat,
            'metadata': self.metadata
        }
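
Note: Example #3 sorts by sentence length to reduce padding, then scatters the test predictions back to the original order via sorted_test_indices. The index bookkeeping in isolation:

    # Sort items by length, remember each item's original position,
    # then scatter predictions back to the original order.
    data = ['a bb', 'e', 'ccc dd eee']
    indexed = sorted(enumerate(data), key=lambda z: len(z[1]))
    orig_positions = [i for i, _ in indexed]        # here: [1, 0, 2]
    sorted_data = [x for _, x in indexed]

    preds_sorted = ['pred:' + x for x in sorted_data]  # stand-in for clf output
    preds = [None] * len(preds_sorted)
    for j, p in enumerate(preds_sorted):
        preds[orig_positions[j]] = p
    assert preds == ['pred:' + x for x in data]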
Example #4
    def run(self, params, batcher):
        self.X, self.y, self.idxs = {}, {}, {}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []
            if key not in self.idxs:
                self.idxs[key] = []

            if "test" in key or "diagnostic" in key:
                input1, input2, mylabels, idxs = self.data[key]
                self.idxs[key] = idxs
            else:
                input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                if (ii*params.batch_size) % (20000*params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" % (100 * ii / n_labels))
            logging.debug("Finished encoding MNLI")
            self.X[key] = np.vstack(enc_input)
            self.y[key] = mylabels
            del enc_input

        config = {'nclasses': 3, 'seed': self.seed, 'usepytorch': params.usepytorch,
                  'cudaEfficient': True, 'nhid': params.nhid, 'noreg': False}

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config) # maybe assert that the order isn't changed
        logging.debug("Built classifier, starting training")
        devacc, testacc, test_preds = clf.run()
        test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])

        mm_acc = round(100*clf.clf.score(self.X['test_mismatched'], self.y['test_mismatched']), 2)
        mm_preds = clf.clf.predict(self.X['test_mismatched'])
        mm_preds = sort_preds(mm_preds.squeeze().tolist(), self.idxs['test_mismatched'])
        d_acc = round(100*clf.clf.score(self.X['diagnostic'], self.y['diagnostic']), 2)
        d_preds = clf.clf.predict(self.X['diagnostic'])
        d_preds = sort_preds(d_preds.squeeze().tolist(), self.idxs['diagnostic'])

        logging.debug('Dev acc : {0} Matched test acc : {1} '
                      'Mismatched test acc : {2} for MNLI\n'.format(
                          devacc, testacc, mm_acc))
        return {'devacc': devacc,
                'matched_acc': testacc, 'preds': test_preds,
                'mismatched_acc': mm_acc, 'mismatched_preds': mm_preds,
                'diagnostic_acc': d_acc, 'diagnostic_preds': d_preds,
                'ndev': len(self.data['valid'][0]), 'ntest': len(self.data['test'][0])}
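
Note: sort_preds is not defined anywhere in this listing. Judging from the call sites (Examples #4 and #12), it restores predictions to their original example order using the stored idxs; a plausible sketch under that assumption:

    def sort_preds(preds, idxs):
        # Assumed behaviour: idxs[i] is the original position of the i-th
        # prediction, so sorting by index restores file order.
        return [p for _, p in sorted(zip(idxs, preds), key=lambda t: t[0])]

    # Usage: predictions arrive in encoding order; idxs maps back.
    assert sort_preds(['b', 'c', 'a'], [1, 2, 0]) == ['a', 'b', 'c']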
Example #5
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(
                        np.hstack(
                            (enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        config = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': True
        }

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc, yhat_sorted = clf.run()
        yhat = [None] * len(yhat_sorted)
        for (i, y) in enumerate(yhat_sorted):
            yhat[self.sorted_test_indices[i]] = y

        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0]),
            'yhat': yhat,
            'metadata': self.metadata
        }
Example #6
    def run(self, params, batcher):
        task_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size
        logging.info('Computing embeddings for train/dev/test')
        for key in self.task_data:
            # Sort to reduce padding
            sorted_data = sorted(zip(self.task_data[key]['X'],
                                     self.task_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.task_data[key]['X'], self.task_data[key]['y'] = map(
                list, zip(*sorted_data))

            task_embed[key]['X'] = []
            for ii in range(0, len(self.task_data[key]['y']), bsize):
                batch = self.task_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                task_embed[key]['X'].append(embeddings)
            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
            task_embed[key]['y'] = np.array(self.task_data[key]['y'])
        logging.info('Computed embeddings')

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'noreg': params.noreg
        }

        if self.task == "WordContent" and params.classifier['nhid'] > 0:
            # Use a linear probe (no hidden layer) for WordContent;
            # deepcopy keeps the caller's params.classifier untouched.
            config_classifier = copy.deepcopy(config_classifier)
            config_classifier['classifier']['nhid'] = 0

        clf = SplitClassifier(X={
            'train': task_embed['train']['X'],
            'valid': task_embed['dev']['X'],
            'test': task_embed['test']['X']
        },
                              y={
                                  'train': task_embed['train']['y'],
                                  'valid': task_embed['dev']['y'],
                                  'test': task_embed['test']['y']
                              },
                              config=config_classifier)

        devacc, testacc = clf.run()
        logging.debug(
            '\nDev acc : %.1f Test acc : %.1f for %s classification\n' %
            (devacc, testacc, self.task.upper()))

        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(task_embed['dev']['X']),
            'ntest': len(task_embed['test']['X'])
        }
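
Note: Example #6 zeroes out the hidden layer for the WordContent probing task, which in SentEval-style configs turns the MLP into a linear (logistic regression) probe. The deepcopy matters; a tiny illustration:

    import copy

    params_classifier = {'nhid': 50, 'optim': 'adam', 'batch_size': 64,
                         'tenacity': 5, 'epoch_size': 4}
    config = {'classifier': params_classifier}

    probe = copy.deepcopy(config)      # do not mutate the caller's dict
    probe['classifier']['nhid'] = 0    # 0 hidden units -> linear probe

    assert params_classifier['nhid'] == 50  # original left untouched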
Example #7
    def run(self, params, batcher):
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_data = sorted(zip(self.sst_data[key]['X'],
                                     self.sst_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.sst_data[key]['X'], self.sst_data[key]['y'] = map(
                list, zip(*sorted_data))

            sst_embed[key]['X'] = []
            for ii in range(0, len(self.sst_data[key]['y']), bsize):
                batch = self.sst_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sst_embed[key]['X'].append(embeddings)
            sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
            sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
            logging.info('Computed {0} embeddings'.format(key))

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier
        }

        clf = SplitClassifier(X={
            'train': sst_embed['train']['X'],
            'valid': sst_embed['dev']['X'],
            'test': sst_embed['test']['X']
        },
                              y={
                                  'train': sst_embed['train']['y'],
                                  'valid': sst_embed['dev']['y'],
                                  'test': sst_embed['test']['y']
                              },
                              config=config_classifier)

        classifier, devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for SST {2} '
                      'classification\n'.format(devacc, testacc, self.task_name))

        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X']),
            'classifier': classifier,
            'X_train': sst_embed['train']['X'],
            #'Y': np.concatenate((sst_embed['train']['y'], sst_embed['dev']['y'], sst_embed['test']['y']), axis=0)
            'X_test': sst_embed['test']['X'],
            'text': self.sst_data['train']['X']
        }
Example #8
    def run(self, params, batcher):
        sst_embed = {"train": {}, "dev": {}, "test": {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info("Computing embedding for {0}".format(key))
            # Sort to reduce padding
            sorted_data = sorted(
                zip(self.sst_data[key]["X"], self.sst_data[key]["y"]),
                key=lambda z: (len(z[0]), z[1]),
            )
            self.sst_data[key]["X"], self.sst_data[key]["y"] = map(
                list, zip(*sorted_data))

            sst_embed[key]["X"] = []
            for ii in range(0, len(self.sst_data[key]["y"]), bsize):
                batch = self.sst_data[key]["X"][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sst_embed[key]["X"].append(embeddings)
            sst_embed[key]["X"] = np.vstack(sst_embed[key]["X"])
            sst_embed[key]["y"] = np.array(self.sst_data[key]["y"])
            logging.info("Computed {0} embeddings".format(key))

        config_classifier = {
            "nclasses": self.nclasses,
            "seed": self.seed,
            "usepytorch": params.usepytorch,
            "classifier": params.classifier,
        }

        clf = SplitClassifier(
            X={
                "train": sst_embed["train"]["X"],
                "valid": sst_embed["dev"]["X"],
                "test": sst_embed["test"]["X"],
            },
            y={
                "train": sst_embed["train"]["y"],
                "valid": sst_embed["dev"]["y"],
                "test": sst_embed["test"]["y"],
            },
            config=config_classifier,
        )

        devacc, testacc = clf.run()
        devacc = devacc.tolist()
        logging.debug("\nDev acc : {0} Test acc : {1} for \
            SST {2} classification\n".format(devacc, testacc, self.task_name))

        return {
            "devacc": devacc,
            "acc": testacc,
            "ndev": len(sst_embed["dev"]["X"]),
            "ntest": len(sst_embed["test"]["X"]),
        }
Example #9
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(
                        np.hstack(
                            (enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        config_classifier = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'maxepoch': 40,
            'nepoches': 4,
            'noreg': False
        }
        clf = SplitClassifier(self.X, self.y, config_classifier)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
Example #10
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                                np.abs(enc1 - enc2))))
                if (ii*params.batch_size) % (20000*params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            #self.y[key] = [dico_label[y] for y in mylabels]
            self.y[key] = mylabels

        config = {'nclasses': 3, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'cudaEfficient': True,
                  'nhid': params.nhid, 'noreg': True}

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc, test_preds = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for ANLI\n'.format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc, 'preds': test_preds,
                'ndev': len(self.data['valid'][0]),
                'ntest': len(self.data['test'][0])}
Example #11
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels, ids = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(
                        np.hstack(
                            (enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                if (ii * params.batch_size) % (200 * params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            try:
                self.y[key] = [dico_label[y] for y in mylabels]
            except KeyError:
                logging.info('key error')
                continue

        config = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': True
        }

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)

        devacc, testacc, preds, probs = clf.run()

        pp = []
        i = 0
        # NOTE: 52 is a hard-coded, dataset-specific bound; probs is assumed
        # to be a flat list of per-class probabilities grouped in threes.
        while i < 52:
            pp.append([probs[i], probs[i + 1], probs[i + 2]])
            i += 3
        dico_label = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
        predictions = []
        for i in preds:
            predictions.append(dico_label[i[0]])
        for i, j, k, l in zip(pp, predictions, self.data['test'][0],
                              self.data['test'][1]):
            print(" ".join(k), "\t", " ".join(l), "\t", j, "\t", i)

        logging.debug('Dev acc : {0} Test acc : {1} for MedNLI\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
Example #12
    def run(self, params, batcher):
        self.X, self.y, self.idxs = {}, {}, {}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []
            if key not in self.idxs:
                self.idxs[key] = []

            if key == 'test':
                if len(self.data[key]) == 2:
                    input1, idxs = self.data[key]
                    mylabels = [0] * len(idxs)
                elif len(self.data[key]) == 3:
                    input1, mylabels, idxs = self.data[key]
                self.idxs[key] = idxs
            else:
                input1, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]

                if len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc_input.append(enc1)
                if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = mylabels

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': False
        }

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc, test_preds = clf.run()
        dev_preds = clf.clf.predict(self.X['valid'])
        dev_mcc = matthews_corrcoef(self.y['valid'], dev_preds.squeeze())
        test_mcc = matthews_corrcoef(self.y['test'], test_preds.squeeze())
        test_preds = sort_preds(test_preds.squeeze().tolist(),
                                self.idxs['test'])
        logging.debug(
            'Dev acc : {0} Dev MCC : {3} Test acc : {1} Test MCC : {2} for CoLA\n'
            .format(devacc, testacc, test_mcc, dev_mcc))
        return {
            'devacc': devacc,
            'devmcc': dev_mcc,
            'acc': testacc,
            'mcc': test_mcc,
            'preds': test_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
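
Note: Example #12 reports Matthews correlation (the usual CoLA metric, robust to label imbalance) next to accuracy. Its computation with scikit-learn, on toy labels:

    import numpy as np
    from sklearn.metrics import matthews_corrcoef

    y_true = np.array([1, 1, 0, 1, 0, 0, 1, 0])
    y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
    # MCC lies in [-1, 1]; 0 means chance-level on any label balance.
    print(round(100 * matthews_corrcoef(y_true, y_pred), 2))  # 50.0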
Example #13
    def run(self, params, batcher):
        if params.train is not None and not params.train:
            sst_embed = {'dev': {}, 'test': {}}
        else:
            sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        train_file_x = 'embeddings/trainx_' + params.model_name + "_sst.csv"
        train_file_y = 'embeddings/trainy_' + params.model_name + "_sst.csv"
        test_file_x = 'embeddings/testx_' + params.model_name + "_sst.csv"
        test_file_y = 'embeddings/testy_' + params.model_name + "_sst.csv"
        dev_file_x = 'embeddings/devx_' + params.model_name + "_sst.csv"
        dev_file_y = 'embeddings/devy_' + params.model_name + "_sst.csv"
        bsize = params.batch_size
        self.params = params
        self.adversarialFunc = params.adversarialFunc

        # for key in self.sst_data:
        #     logging.info('Computing embedding for {0}'.format(key))
        #     # Sort to reduce padding
        #     sorted_data = sorted(zip(self.sst_data[key]['X'],
        #                              self.sst_data[key]['y']),
        #                          key=lambda z: (len(z[0]), z[1]))
        #     self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))
        #
        #     sst_embed[key]['X'] = []
        #     for ii in range(0, len(self.sst_data[key]['y']), bsize):
        #         n = len(self.sst_data[key]['y'])/bsize
        #         # if ((ii/bsize)*100/n) % 10 == 0:
        #         print("%d percent done out of %d"%( ((ii/bsize)*100/n), len(self.sst_data[key]['y'])))
        #         batch = self.sst_data[key]['X'][ii:ii + bsize]
        #         embeddings = batcher(params, batch)
        #         sst_embed[key]['X'].append(embeddings)
        #         # logging.info('computed batch {0}, out of total {1}'.format(ii,bsize))
        #     sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
        #     sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
        #     logging.info('Computed {0} embeddings'.format(key))
        #
        # pickle.dump(sst_embed['train']['X'], open(train_file_x, 'wb'))
        # pickle.dump(sst_embed['train']['y'], open(train_file_y, 'wb'))
        #
        # pickle.dump(sst_embed['test']['X'], open(test_file_x, 'wb'))
        # pickle.dump(sst_embed['test']['y'], open(test_file_y, 'wb'))
        # pickle.dump(sst_embed['dev']['X'], open(dev_file_x, 'wb'))
        # pickle.dump(sst_embed['dev']['y'], open(dev_file_y, 'wb'))

        logging.info("dumped files")

        sst_embed['train']['X'] = pickle.load(open(train_file_x, 'rb'))
        sst_embed['train']['y'] = pickle.load(open(train_file_y, 'rb'))
        sst_embed['test']['X'] = pickle.load(open(test_file_x, 'rb'))
        sst_embed['test']['y'] = pickle.load(open(test_file_y, 'rb'))
        sst_embed['dev']['X'] = pickle.load(open(dev_file_x, 'rb'))
        sst_embed['dev']['y'] = pickle.load(open(dev_file_y, 'rb'))
        logging.info("loaded sst embeddings.")

        # print "printing to check if wordvecs fored correct\n"
        #
        # for word in self.sst_data['test']['X'][0]:
        #     print word, "-"*30
        #     print params.word_vec[word][:20]
        # print "sent embedding", "-"*30
        # print sst_embed['test']['X'][0][:20]
        # print "\n\n"

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'adversarial_sample_generator': (self.generate_adv_samples
                                             if self.adversarialFunc is not None
                                             else None)
        }

        if params.train is not None and not params.train:
            X = {
                'train': {},
                'valid': sst_embed['dev']['X'],
                'test': sst_embed['test']['X']
            }
            y = {
                'train': {},
                'valid': sst_embed['dev']['y'],
                'test': sst_embed['test']['y']
            }

        else:
            X = {
                'train': sst_embed['train']['X'],
                'valid': sst_embed['dev']['X'],
                'test': sst_embed['test']['X']
            }
            y = {
                'train': sst_embed['train']['y'],
                'valid': sst_embed['dev']['y'],
                'test': sst_embed['test']['y']
            }

        clf = SplitClassifier(X,
                              y,
                              config=config_classifier,
                              test_dataX=self.sst_data['test']['X'],
                              test_dataY=self.sst_data['test']['y'])
        params.task_name = "sst"
        devacc, testacc, adv_results = clf.run(params)
        logging.debug('\nDev acc : {0} Test acc : {1} for SST {2} '
                      'classification\n'.format(devacc, testacc, self.task_name))

        results = dict()
        results['task_results'] = {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X'])
        }

        results['adv_results'] = adv_results
        print("added adv results to pass back")
        return results
Example #14
    def run(self, params, batcher):
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            sorted_data = sorted(zip(self.sst_data[key]['X'],
                                     self.sst_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.sst_data[key]['X'], self.sst_data[key]['y'] = map(
                list, zip(*sorted_data))

            sst_embed[key]['X'] = []
            for ii in range(0, len(self.sst_data[key]['y']), bsize):
                batch = self.sst_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sst_embed[key]['X'].append(embeddings)
            sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
            sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
            logging.info('Computed {0} embeddings'.format(key))

        dev_length = len(sst_embed['dev']['X'])
        test_length = len(sst_embed['test']['X'])

        embeddings = []
        index = 0

        trainX_indexes = []
        for embedding in sst_embed['train']['X']:
            embeddings.append(embedding)
            trainX_indexes.append(index)
            index += 1
        trainX_indexes = np.vstack(trainX_indexes)
        del sst_embed['train']['X']
        trainy_indexes = sst_embed['train']['y']
        del sst_embed['train']['y']

        devX_indexes = []
        for embedding in sst_embed['dev']['X']:
            embeddings.append(embedding)
            devX_indexes.append(index)
            index += 1
        devX_indexes = np.vstack(devX_indexes)
        del sst_embed['dev']['X']
        devy_indexes = sst_embed['dev']['y']
        del sst_embed['dev']['y']

        testX_indexes = []
        for embedding in sst_embed['test']['X']:
            embeddings.append(embedding)
            testX_indexes.append(index)
            index += 1
        testX_indexes = np.vstack(testX_indexes)
        del sst_embed['test']['X']
        testy_indexes = sst_embed['test']['y']
        del sst_embed['test']['y']

        sst_embed = None
        embeddings = np.vstack(embeddings)

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier
        }

        clf = SplitClassifier(X={
            'train': trainX_indexes,
            'valid': devX_indexes,
            'test': testX_indexes
        },
                              y={
                                  'train': trainy_indexes,
                                  'valid': devy_indexes,
                                  'test': testy_indexes
                              },
                              embeddings=embeddings,
                              config=config_classifier)

        devacc, testacc = clf.run()
        logging.debug('\nDev acc : {0} Test acc : {1} for SST {2} '
                      'classification\n'.format(devacc, testacc, self.task_name))

        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': dev_length,
            'ntest': test_length
        }
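
Note: Examples #14 and #19 keep a single shared embeddings matrix and hand SplitClassifier per-split row indices instead of per-split copies, presumably so the feature matrix is held in memory once. The bookkeeping reduces to:

    import numpy as np

    splits = {'train': np.random.rand(6, 8),
              'valid': np.random.rand(2, 8),
              'test': np.random.rand(3, 8)}

    embeddings, X_indexes, index = [], {}, 0
    for key in ['train', 'valid', 'test']:
        rows = []
        for row in splits[key]:
            embeddings.append(row)
            rows.append(index)
            index += 1
        X_indexes[key] = np.vstack(rows)  # (n, 1) column of row ids, as above

    embeddings = np.vstack(embeddings)
    # Any split's features are recovered by indexing the shared matrix:
    assert np.allclose(embeddings[X_indexes['test'].ravel()], splits['test'])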
Example #15
    def run(self, params, batcher):
        task_embed = {"train": {}, "dev": {}, "test": {}}
        bsize = params.batch_size
        logging.info("Computing embeddings for train/dev/test")
        for key in self.task_data:
            # Sort to reduce padding
            sorted_data = sorted(
                zip(
                    self.task_data[key]["X"],
                    self.task_data[key]["id"],
                    self.task_data[key]["y"],
                    self.task_data[key]["head"],
                    self.task_data[key]["tail"],
                    self.task_data[key]["ner"],
                    self.task_data[key]["pos"],
                    self.task_data[key]["dep"],
                    self.task_data[key]["dep_head"],
                ),
                key=lambda z: (len(z[0]), z[1]),
            )
            (
                self.task_data[key]["X"],
                self.task_data[key]["id"],
                self.task_data[key]["y"],
                self.task_data[key]["head"],
                self.task_data[key]["tail"],
                self.task_data[key]["ner"],
                self.task_data[key]["pos"],
                self.task_data[key]["dep"],
                self.task_data[key]["dep_head"],
            ) = map(list, zip(*sorted_data))

            task_embed[key]["X"] = []
            for ii in range(0, len(self.task_data[key]["y"]), bsize):
                batch = self.task_data[key]["X"][ii:ii + bsize]
                id_ = self.task_data[key]["id"][ii:ii + bsize]
                id_ = id_ if id_ != "None" else None
                head = self.task_data[key]["head"][ii:ii + bsize]
                tail = self.task_data[key]["tail"][ii:ii + bsize]
                ner = self.task_data[key]["ner"][ii:ii + bsize]
                pos = self.task_data[key]["pos"][ii:ii + bsize]
                dep = self.task_data[key]["dep"][ii:ii + bsize]
                dep_head = self.task_data[key]["dep_head"][ii:ii + bsize]

                embeddings = batcher(params, batch, head, tail, ner, pos, dep,
                                     dep_head, id_)
                task_embed[key]["X"].append(embeddings)
            task_embed[key]["X"] = np.vstack(task_embed[key]["X"])
            task_embed[key]["y"] = np.array(self.task_data[key]["y"])
        logging.info("Computed embeddings")

        config_classifier = {
            "nclasses": self.nclasses,
            "seed": self.seed,
            "usepytorch": params.usepytorch,
            "classifier": params.classifier,
        }

        # if self.task == "WordContent" and params.classifier["nhid"] > 0:
        #     config_classifier = copy.deepcopy(config_classifier)
        #     config_classifier["classifier"]["nhid"] = 0
        #     print(params.classifier["nhid"])

        clf = SplitClassifier(
            X={
                "train": task_embed["train"]["X"],
                "valid": task_embed["dev"]["X"],
                "test": task_embed["test"]["X"],
            },
            y={
                "train": task_embed["train"]["y"],
                "valid": task_embed["dev"]["y"],
                "test": task_embed["test"]["y"],
            },
            config=config_classifier,
        )

        devacc, testacc = clf.run()
        logging.debug(
            "\nDev acc : %.1f Test acc : %.1f for %s classification\n" %
            (devacc, testacc, self.task.upper()))

        return {
            "devacc": devacc,
            "acc": testacc,
            "ndev": len(task_embed["dev"]["X"]),
            "ntest": len(task_embed["test"]["X"]),
        }
Example #16
    def run(self, params, batcher):
        rqe_embed = {'dev': {}, 'train': {}, 'test': {}}

        for key in self.rqe_data:
            logging.info('Computing embedding for {0}'.format(key))
            # Sort to reduce padding
            text_data = {}
            sorted_corpus = sorted(zip(self.rqe_data[key]['chq'],
                                       self.rqe_data[key]['faq'],
                                       self.rqe_data[key]['label'],
                                       self.rqe_data[key]['pid']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))
            text_data['chq'] = [x for (x, y, z, w) in sorted_corpus]
            text_data['faq'] = [y for (x, y, z, w) in sorted_corpus]
            text_data['label'] = [z for (x, y, z, w) in sorted_corpus]
            text_data['pid'] = [w for (x, y, z, w) in sorted_corpus]
            for txt_type in ['chq', 'faq']:
                rqe_embed[key][txt_type] = []
                for ii in range(0, len(text_data['label']), params.batch_size):
                    batch = text_data[txt_type][ii:ii + params.batch_size]
                    embeddings = batcher(params, batch)
                    rqe_embed[key][txt_type].append(embeddings)
                rqe_embed[key][txt_type] = np.vstack(rqe_embed[key][txt_type])
            rqe_embed[key]['label'] = np.array(text_data['label'])
            logging.info('Computed {0} embeddings'.format(key))

        # Train
        trainC = rqe_embed['train']['chq']
        trainF = rqe_embed['train']['faq']
        #print(trainC.shape,trainF.shape,(np.abs(trainC - trainF)).shape, (trainC * trainF).shape)
        #trainCF = np.c_[trainC, trainF,np.abs(trainC - trainF), (trainC * trainF)]
        trainCF = np.hstack(
            (trainC, trainF, trainC * trainF, np.abs(trainC - trainF)))
        trainY = rqe_embed['train']['label']
        print('Done embedding for train')

        # Test
        testC = rqe_embed['test']['chq']
        testF = rqe_embed['test']['faq']
        #testCF = np.c_[testC, testF,  np.abs(testC - testF), testC * testF]
        testCF = np.hstack(
            (testC, testF, testC * testF, np.abs(testC - testF)))
        testY = rqe_embed['test']['label']

        print('Done embedding for test')

        # dev
        devC = rqe_embed['dev']['chq']
        devF = rqe_embed['dev']['faq']
        #testCF = np.c_[testC, testF,  np.abs(testC - testF), testC * testF]
        devCF = np.hstack((devC, devF, devC * devF, np.abs(devC - devF)))
        devY = rqe_embed['dev']['label']

        print('Done embedding for dev')

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier,
            'nhid': params.nhid,
            'kfold': params.kfold
        }
        clf = SplitClassifier(X={
            'train': trainCF,
            'valid': devCF,
            'test': testCF
        },
                              y={
                                  'train': trainY,
                                  'valid': devY,
                                  'test': testY
                              },
                              config=config)
        devacc, testacc, yhat = clf.run()

        logging.debug('Dev acc : {0} Test acc : {1} for RQE\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(devCF),
            'ntest': len(testCF)
        }
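
Note: Example #16 keeps commented-out np.c_ variants beside the np.hstack calls it actually uses; for 2-D arrays the two produce the same column-wise concatenation:

    import numpy as np

    a = np.arange(6.0).reshape(2, 3)
    b = a + 10
    assert np.array_equal(np.c_[a, b], np.hstack((a, b)))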
Example #17
    def run(self, params, batcher):
        task_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size
        logging.info('Computing embeddings for train/dev/test')
        for key in self.task_data:
            # Sort to reduce padding
            zipped_data = sorted(enumerate(
                zip(self.task_data[key]['X'], self.task_data[key]['y'])),
                                 key=lambda z: (len(z[1][0]), z[1][1]))
            if key == 'test':
                sorted_test_indices = [i for (i, z) in zipped_data]
            self.task_data[key]['X'] = [x for (i, (x, y)) in zipped_data]
            self.task_data[key]['y'] = [y for (i, (x, y)) in zipped_data]

            task_embed[key]['X'] = []
            for ii in range(0, len(self.task_data[key]['y']), bsize):
                batch = self.task_data[key]['X'][ii:ii + bsize]
                embeddings = batcher(params, batch)
                task_embed[key]['X'].append(embeddings)
            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
            task_embed[key]['y'] = np.array(self.task_data[key]['y'])
        logging.info('Computed embeddings')

        config_classifier = {
            'nclasses': self.nclasses,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'classifier': params.classifier
        }

        if self.task == "WordContent" and params.classifier['nhid'] > 0:
            # Use a linear probe (no hidden layer) for WordContent;
            # deepcopy keeps the caller's params.classifier untouched.
            config_classifier = copy.deepcopy(config_classifier)
            config_classifier['classifier']['nhid'] = 0

        clf = SplitClassifier(X={
            'train': task_embed['train']['X'],
            'valid': task_embed['dev']['X'],
            'test': task_embed['test']['X']
        },
                              y={
                                  'train': task_embed['train']['y'],
                                  'valid': task_embed['dev']['y'],
                                  'test': task_embed['test']['y']
                              },
                              config=config_classifier)

        devacc, testacc, yhat_sorted = clf.run()
        yhat = [None] * len(yhat_sorted)
        for (i, y) in enumerate(yhat_sorted):
            yhat[sorted_test_indices[i]] = y
        logging.debug(
            '\nDev acc : %.1f Test acc : %.1f for %s classification\n' %
            (devacc, testacc, self.task.upper()))

        return {
            'devacc': devacc,
            'acc': testacc,
            'ndev': len(task_embed['dev']['X']),
            'ntest': len(task_embed['test']['X']),
            'metadata': self.metadata,
            'yhat': yhat
        }
Example #18
    def run(self, params, batcher):
        train_embeddings, valid_embeddings, test_embeddings = [], [], []

        # Sort to reduce padding
        sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
                                     key=lambda z: (len(z[0]), z[1]))
        train_samples = [x for (x, y) in sorted_corpus_train]
        train_labels = [y for (x, y) in sorted_corpus_train]
        
        sorted_corpus_valid = sorted(zip(self.valid['X'], self.valid['y']),
                                     key=lambda z: (len(z[0]), z[1]))
        valid_samples = [x for (x, y) in sorted_corpus_valid]
        valid_labels = [y for (x, y) in sorted_corpus_valid]

        sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),
                                    key=lambda z: (len(z[0]), z[1]))
        test_samples = [x for (x, y) in sorted_corpus_test]
        test_labels = [y for (x, y) in sorted_corpus_test]

        # Get train embeddings
        for ii in range(0, len(train_labels), params.batch_size):
            batch = train_samples[ii:ii + params.batch_size]
            embeddings = batcher(params, batch)
            train_embeddings.append(embeddings)
        train_embeddings = np.vstack(train_embeddings)
        logging.info('Computed train embeddings')
        
        # Get validation embeddings
        for ii in range(0, len(valid_labels), params.batch_size):
            batch = valid_samples[ii:ii + params.batch_size]
            embeddings = batcher(params, batch)
            valid_embeddings.append(embeddings)
        valid_embeddings = np.vstack(valid_embeddings)
        logging.info('Computed Validation embeddings')

        # Get test embeddings
        for ii in range(0, len(test_labels), params.batch_size):
            batch = test_samples[ii:ii + params.batch_size]
            embeddings = batcher(params, batch)
            test_embeddings.append(embeddings)
        test_embeddings = np.vstack(test_embeddings)
        logging.info('Computed test embeddings')
        
        config = {'nclasses': 7, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'cudaEfficient': True,
                  'nhid': params.nhid, 'noreg': True}

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(X={'train': train_embeddings,
                                 'valid': valid_embeddings,
                                 'test': test_embeddings},
                              y={'train': train_labels,
                                 'valid': valid_labels,
                                 'test': test_labels},
                              config=config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for PICO\n'
                      .format(devacc, testacc))
        return {'devacc': devacc, 'acc': testacc,
                'ndev': len(self.valid['X']),
                'ntest': len(self.test['X'])}
Example #19
    def run(self, params, batcher):
        self.X, self.y = {}, {}
        dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
        for key in self.data:
            if key not in self.X:
                self.X[key] = []
            if key not in self.y:
                self.y[key] = []

            input1, input2, mylabels = self.data[key]
            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, params.batch_size):
                batch1 = input1[ii:ii + params.batch_size]
                batch2 = input2[ii:ii + params.batch_size]

                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(
                        np.hstack(
                            (enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" %
                                 (100 * ii / n_labels))
            self.X[key] = np.vstack(enc_input)
            self.y[key] = [dico_label[y] for y in mylabels]

        ndev = len(self.data['valid'][0])
        ntest = len(self.data['test'][0])
        self.data = None

        config = {
            'nclasses': 3,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': True
        }

        embeddings = []
        index = 0
        X_indexes = dict()
        for key in ['train', 'valid', 'test']:
            X_indexes[key] = []
            for embedding in self.X[key]:
                embeddings.append(embedding)
                X_indexes[key].append(index)
                index += 1
            X_indexes[key] = np.vstack(X_indexes[key])
            del self.X[key]

        self.X = None
        embeddings = np.vstack(embeddings)

        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(X_indexes, self.y, embeddings, config)
        devacc, testacc = clf.run()
        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'.format(
            devacc, testacc))
        return {'devacc': devacc, 'acc': testacc, 'ndev': ndev, 'ntest': ntest}
Example #20
    def run(self, params, batcher):
        sick_embed = {"train": {}, "dev": {}, "test": {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info("Computing embedding for {0}".format(key))
            # Sort to reduce padding
            sorted_corpus = sorted(
                zip(
                    self.sick_data[key]["X_A"],
                    self.sick_data[key]["X_B"],
                    self.sick_data[key]["y"],
                ),
                key=lambda z: (len(z[0]), len(z[1]), z[2]),
            )

            self.sick_data[key]["X_A"] = [x for (x, y, z) in sorted_corpus]
            self.sick_data[key]["X_B"] = [y for (x, y, z) in sorted_corpus]
            self.sick_data[key]["y"] = [z for (x, y, z) in sorted_corpus]

            for txt_type in ["X_A", "X_B"]:
                sick_embed[key][txt_type] = []
                for ii in range(0, len(self.sick_data[key]["y"]), bsize):
                    batch = self.sick_data[key][txt_type][ii:ii + bsize]
                    embeddings = batcher(params, batch)
                    sick_embed[key][txt_type].append(embeddings)
                sick_embed[key][txt_type] = np.vstack(
                    sick_embed[key][txt_type])
            logging.info("Computed {0} embeddings".format(key))

        # Train
        trainA = sick_embed["train"]["X_A"]
        trainB = sick_embed["train"]["X_B"]
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = np.array(self.sick_data["train"]["y"])

        # Dev
        devA = sick_embed["dev"]["X_A"]
        devB = sick_embed["dev"]["X_B"]
        devF = np.c_[np.abs(devA - devB), devA * devB]
        devY = np.array(self.sick_data["dev"]["y"])

        # Test
        testA = sick_embed["test"]["X_A"]
        testB = sick_embed["test"]["X_B"]
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = np.array(self.sick_data["test"]["y"])

        config = {
            "nclasses": 3,
            "seed": self.seed,
            "usepytorch": params.usepytorch,
            "classifier": params.classifier,
            "nhid": params.nhid,
        }
        clf = SplitClassifier(
            X={
                "train": trainF,
                "valid": devF,
                "test": testF
            },
            y={
                "train": trainY,
                "valid": devY,
                "test": testY
            },
            config=config,
        )

        devacc, testacc = clf.run()
        devacc = devacc.tolist()
        logging.debug("\nDev acc : {0} Test acc : {1} for \
                       SICK entailment\n".format(devacc, testacc))
        return {
            "devacc": devacc,
            "acc": testacc,
            "ndev": len(devA),
            "ntest": len(testA),
        }