Beispiel #1
0
    def run(self, params, batcher):
        """Encode all MNLI splits, run the split classifier and collect scores.

        Every split in ``self.data`` is encoded batch-wise with ``batcher``.
        For a sentence pair with encodings ``u`` and ``v`` the feature row is
        the horizontal stack ``[u, v, u * v, |u - v|]``.

        Args:
            params: settings object; this method reads ``batch_size``,
                ``usepytorch``, ``nhid`` and ``classifier`` from it.
            batcher: callable invoked as ``batcher(params, batch)`` that
                returns the encodings of one batch of sentences.

        Returns:
            dict holding the dev, matched-test, mismatched-test and
            diagnostic scores, the three prediction lists after
            ``sort_preds``, and the sizes of the ``valid`` and ``test``
            splits.
        """
        self.X, self.y, self.idxs = {}, {}, {}
        for split in self.data:
            # The three dicts were emptied just above, so each split starts
            # out with empty placeholders.
            self.X[split], self.y[split], self.idxs[split] = [], [], []

            if "test" in split or "diagnostic" in split:
                # These splits carry a fourth field that is later handed to
                # sort_preds together with the predictions.
                sents_a, sents_b, labels, order = self.data[split]
                self.idxs[split] = order
            else:
                sents_a, sents_b, labels = self.data[split]

            step = params.batch_size
            total = len(labels)
            features = []
            for start in range(0, total, step):
                stop = start + step
                chunk_a = sents_a[start:stop]
                chunk_b = sents_b[start:stop]

                # Only non-empty batches of equal size are encoded.
                if 0 < len(chunk_a) == len(chunk_b):
                    u = batcher(params, chunk_a)
                    v = batcher(params, chunk_b)
                    features.append(np.hstack((u, v, u * v, np.abs(u - v))))
                # Same test as (start * step) % (20000 * step) == 0, since
                # step is a positive integer whenever this body executes.
                if start % 20000 == 0:
                    logging.info("PROGRESS (encoding): %.2f%%" % (100 * start / total))
            logging.debug("Finished encoding MNLI")
            self.X[split] = np.vstack(features)
            self.y[split] = labels
            del features

        config = dict(nclasses=3, seed=self.seed, usepytorch=params.usepytorch,
                      cudaEfficient=True, nhid=params.nhid, noreg=False)

        clf_settings = copy.deepcopy(params.classifier)
        clf_settings['max_epoch'] = 15
        clf_settings['epoch_size'] = 1
        config['classifier'] = clf_settings

        clf = SplitClassifier(self.X, self.y, config)  # maybe assert that the order isn't changed
        logging.debug("Built classifier, starting training")
        devacc, testacc, test_preds = clf.run()
        test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])

        def evaluate_split(name):
            # Score first, then predict, then reorder the predictions.
            acc = round(100 * clf.clf.score(self.X[name], self.y[name]), 2)
            preds = clf.clf.predict(self.X[name])
            return acc, sort_preds(preds.squeeze().tolist(), self.idxs[name])

        mm_acc, mm_preds = evaluate_split('test_mismatched')
        d_acc, d_preds = evaluate_split('diagnostic')

        summary = 'Dev acc : {0} Matched test acc : {1} Mismatched test acc: {2} for MNLI\n'
        logging.debug(summary.format(devacc, testacc, mm_acc))
        return {
            'devacc': devacc,
            'matched_acc': testacc,
            'preds': test_preds,
            'mismatched_acc': mm_acc,
            'mismatched_preds': mm_preds,
            'diagnostic_acc': d_acc,
            'diagnostic_preds': d_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0]),
        }
Beispiel #2
0
    def run(self, params, batcher):
        """Embed the SST splits, run the split classifier and report scores.

        Each split in ``self.sst_data`` is sorted in place (by sentence
        length, then by the remaining fields) so that batches contain
        sentences of similar length, then embedded batch-wise with
        ``batcher``.

        Args:
            params: settings object; this method reads ``batch_size``,
                ``usepytorch`` and ``classifier`` from it.
            batcher: callable invoked as ``batcher(params, batch)`` that
                returns the embeddings of one batch of sentences.

        Returns:
            dict with keys ``devacc``, ``acc``, ``preds`` (test predictions
            after ``sort_preds``), ``ndev`` and ``ntest``.
        """
        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sst_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.sst_data[key]
            embed = sst_embed[key]

            # Sort to reduce padding.  Only the test split carries 'idx'; it
            # is sorted along with the sentences and handed to sort_preds
            # together with the predictions further down.
            fields = ('X', 'y', 'idx') if key == 'test' else ('X', 'y')
            sorted_data = sorted(zip(*(split[field] for field in fields)),
                                 key=lambda z: (len(z[0]),) + z[1:])
            for pos, field in enumerate(fields):
                split[field] = [row[pos] for row in sorted_data]
            if key == 'test':
                embed['idx'] = split['idx']

            batches = [batcher(params, split['X'][ii:ii + bsize])
                       for ii in range(0, len(split['y']), bsize)]
            embed['X'] = np.vstack(batches)
            embed['y'] = np.array(split['y'])
            logging.info('Computed {0} embeddings'.format(key))

        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                             'usepytorch': params.usepytorch,
                             'classifier': params.classifier, 'noreg': False}

        clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                                 'valid': sst_embed['dev']['X'],
                                 'test': sst_embed['test']['X']},
                              y={'train': sst_embed['train']['y'],
                                 'valid': sst_embed['dev']['y'],
                                 'test': sst_embed['test']['y']},
                              config=config_classifier)

        devacc, testacc, test_preds = clf.run()
        test_preds = sort_preds(test_preds.squeeze().tolist(), sst_embed['test']['idx'])
        # Adjacent literals instead of a backslash continuation inside the
        # string: the continuation used to embed the source indentation in
        # the emitted log message.
        summary = ('\nDev acc : {0} Test acc : {1} for '
                   'SST {2} classification\n')
        logging.debug(summary.format(devacc, testacc, self.task_name))
        return {'devacc': devacc, 'acc': testacc, 'preds': test_preds,
                'ndev': len(sst_embed['dev']['X']), 'ntest': len(sst_embed['test']['X'])}
Beispiel #3
0
    def run(self, params, batcher):
        """Encode the QNLI splits, run the split classifier and report scores.

        Every split in ``self.data`` is encoded batch-wise with ``batcher``.
        For a sentence pair with encodings ``u`` and ``v`` the feature row is
        the horizontal stack ``[u, v, u * v, |u - v|]``.

        Args:
            params: settings object; this method reads ``batch_size``,
                ``usepytorch``, ``nhid`` and ``classifier`` from it.
            batcher: callable invoked as ``batcher(params, batch)`` that
                returns the encodings of one batch of sentences.

        Returns:
            dict with keys ``devacc``, ``acc``, ``preds`` (test predictions
            after ``sort_preds``), ``ndev`` and ``ntest``.

        Raises:
            ValueError: if the ``test`` split has neither 3 nor 4 fields.
        """
        self.X, self.y, self.idxs = {}, {}, {}
        bsize = params.batch_size
        for key in self.data:
            fields = self.data[key]
            self.idxs[key] = []

            if key == 'test':
                if len(fields) == 3:
                    # No labels supplied: use dummy zero labels so there is
                    # one label per example.
                    input1, input2, idxs = fields
                    mylabels = [0] * len(idxs)
                elif len(fields) == 4:
                    input1, input2, mylabels, idxs = fields
                else:
                    # Previously this fell through and failed with an
                    # unrelated UnboundLocalError on `idxs`.
                    raise ValueError(
                        "QNLI test split must have 3 fields (input1, input2, "
                        "idxs) or 4 fields (input1, input2, labels, idxs); "
                        "got {0}".format(len(fields)))
                self.idxs[key] = idxs
            else:
                input1, input2, mylabels = fields

            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, bsize):
                batch1 = input1[ii:ii + bsize]
                batch2 = input2[ii:ii + bsize]

                # Only non-empty batches of equal size are encoded.
                if len(batch1) == len(batch2) and len(batch1) > 0:
                    enc1 = batcher(params, batch1)
                    enc2 = batcher(params, batch2)
                    enc_input.append(
                        np.hstack(
                            (enc1, enc2, enc1 * enc2, np.abs(enc1 - enc2))))
                # Equivalent to the former
                # (ii * batch_size) % (20000 * batch_size) == 0 test.
                if ii % 20000 == 0:
                    logging.info("PROGRESS (encoding): %.2f%%",
                                 100 * ii / n_labels)
            self.X[key] = np.vstack(enc_input)
            self.y[key] = mylabels

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': False
        }

        # Work on a copy so the caller's classifier settings stay untouched.
        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc, test_preds = clf.run()
        test_preds = sort_preds(test_preds.squeeze().tolist(),
                                self.idxs['test'])
        logging.debug('Dev acc : {0} Test acc : {1} for QNLI\n'.format(
            devacc, testacc))
        return {
            'devacc': devacc,
            'acc': testacc,
            'preds': test_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
Beispiel #4
0
    def run(self, params, batcher):
        """Encode the CoLA splits, run the split classifier and report scores.

        Every split in ``self.data`` is encoded batch-wise with ``batcher``;
        the encodings are used directly as features.

        Args:
            params: settings object; this method reads ``batch_size``,
                ``usepytorch``, ``nhid`` and ``classifier`` from it.
            batcher: callable invoked as ``batcher(params, batch)`` that
                returns the encodings of one batch of sentences.

        Returns:
            dict with keys ``devacc``, ``devmcc``, ``acc``, ``mcc``,
            ``preds`` (test predictions after ``sort_preds``), ``ndev`` and
            ``ntest``.

        Raises:
            ValueError: if the ``test`` split has neither 2 nor 3 fields.
        """
        self.X, self.y, self.idxs = {}, {}, {}
        bsize = params.batch_size
        for key in self.data:
            fields = self.data[key]
            self.idxs[key] = []

            if key == 'test':
                if len(fields) == 2:
                    # No labels supplied: use dummy zero labels so there is
                    # one label per example.  The test accuracy and MCC
                    # computed below are then measured against these zeros.
                    input1, idxs = fields
                    mylabels = [0] * len(idxs)
                elif len(fields) == 3:
                    input1, mylabels, idxs = fields
                else:
                    # Previously this fell through and failed with an
                    # unrelated UnboundLocalError on `idxs`.
                    raise ValueError(
                        "CoLA test split must have 2 fields (input1, idxs) "
                        "or 3 fields (input1, labels, idxs); "
                        "got {0}".format(len(fields)))
                self.idxs[key] = idxs
            else:
                input1, mylabels = fields

            enc_input = []
            n_labels = len(mylabels)
            for ii in range(0, n_labels, bsize):
                batch1 = input1[ii:ii + bsize]

                if len(batch1) > 0:
                    enc_input.append(batcher(params, batch1))
                # Equivalent to the former
                # (ii * batch_size) % (20000 * batch_size) == 0 test.
                if ii % 20000 == 0:
                    logging.info("PROGRESS (encoding): %.2f%%",
                                 100 * ii / n_labels)
            self.X[key] = np.vstack(enc_input)
            self.y[key] = mylabels

        config = {
            'nclasses': 2,
            'seed': self.seed,
            'usepytorch': params.usepytorch,
            'cudaEfficient': True,
            'nhid': params.nhid,
            'noreg': False
        }

        # Work on a copy so the caller's classifier settings stay untouched.
        config_classifier = copy.deepcopy(params.classifier)
        config_classifier['max_epoch'] = 15
        config_classifier['epoch_size'] = 1
        config['classifier'] = config_classifier

        clf = SplitClassifier(self.X, self.y, config)
        devacc, testacc, test_preds = clf.run()
        dev_preds = clf.clf.predict(self.X['valid'])
        dev_mcc = matthews_corrcoef(self.y['valid'], dev_preds.squeeze())
        # MCC must be computed before sort_preds rebinds test_preds.
        test_mcc = matthews_corrcoef(self.y['test'], test_preds.squeeze())
        test_preds = sort_preds(test_preds.squeeze().tolist(),
                                self.idxs['test'])
        logging.debug(
            'Dev acc : {0} Dev MCC : {3} Test acc : {1} Test MCC : {2} for CoLA\n'
            .format(devacc, testacc, test_mcc, dev_mcc))
        return {
            'devacc': devacc,
            'devmcc': dev_mcc,
            'acc': testacc,
            'mcc': test_mcc,
            'preds': test_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])
        }
Beispiel #5
0
    def run(self, params, batcher):
        """Embed the SICK splits, run the relatedness model and score it.

        Each split in ``self.sick_data`` is sorted in place (by the lengths
        of both sentences, then by the remaining fields) so that batches
        contain sentences of similar length, then both sentence columns are
        embedded batch-wise with ``batcher``.  For a pair with embeddings
        ``a`` and ``b`` the feature row is ``[|a - b|, a * b]``.

        Args:
            params: settings object; this method reads ``batch_size``.
            batcher: callable invoked as ``batcher(params, batch)`` that
                returns the embeddings of one batch of sentences.

        Returns:
            dict with keys ``devpearson``, ``devspearman``, ``pearson``,
            ``spearman``, ``mse``, ``preds`` (test predictions after
            ``sort_preds``), ``ndev`` and ``ntest``.
        """
        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
        bsize = params.batch_size

        for key in self.sick_data:
            logging.info('Computing embedding for {0}'.format(key))
            split = self.sick_data[key]
            embed = sick_embed[key]

            # Sort to reduce padding.  Only the test split carries 'idx'; it
            # is sorted along with the sentences and handed to sort_preds
            # together with the predictions further down.  The corpus is
            # sorted exactly once (an earlier, discarded sort was removed).
            fields = ['X_A', 'X_B', 'y']
            if key == 'test':
                fields.append('idx')
            sorted_corpus = sorted(
                zip(*(split[field] for field in fields)),
                key=lambda z: (len(z[0]), len(z[1])) + z[2:])
            for pos, field in enumerate(fields):
                split[field] = [row[pos] for row in sorted_corpus]
            if key == 'test':
                embed['idx'] = split['idx']

            for txt_type in ['X_A', 'X_B']:
                batches = [batcher(params, split[txt_type][ii:ii + bsize])
                           for ii in range(0, len(split['y']), bsize)]
                embed[txt_type] = np.vstack(batches)
            embed['y'] = np.array(split['y'])
            logging.info('Computed {0} embeddings'.format(key))

        def pair_features(emb_a, emb_b):
            # Column-wise concatenation of |a - b| and a * b.
            return np.c_[np.abs(emb_a - emb_b), emb_a * emb_b]

        # Train
        trainF = pair_features(sick_embed['train']['X_A'],
                               sick_embed['train']['X_B'])
        trainY = self.encode_labels(self.sick_data['train']['y'])

        # Dev
        devA = sick_embed['dev']['X_A']
        devF = pair_features(devA, sick_embed['dev']['X_B'])
        devY = self.encode_labels(self.sick_data['dev']['y'])

        # Test
        testA = sick_embed['test']['X_A']
        testF = pair_features(testA, sick_embed['test']['X_B'])
        testY = self.encode_labels(self.sick_data['test']['y'])

        config = {'seed': self.seed, 'nclasses': 5}
        clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
                                 valid={'X': devF, 'y': devY},
                                 test={'X': testF, 'y': testY},
                                 devscores=self.sick_data['dev']['y'],
                                 config=config)

        devpr, yhat, dev_preds = clf.run()

        dev_gold = self.sick_data['dev']['y']
        test_gold = self.sick_data['test']['y']
        dev_sr = spearmanr(dev_preds, dev_gold)[0]
        pr = pearsonr(yhat, test_gold)[0]
        sr = spearmanr(yhat, test_gold)[0]
        se = mean_squared_error(yhat, test_gold)
        test_preds = sort_preds(yhat.squeeze().tolist(),
                                sick_embed['test']['idx'])
        logging.debug('Dev : Pearson {0} Spearman {1}'.format(devpr, dev_sr))
        # Adjacent literals instead of a backslash continuation inside the
        # string: the continuation used to embed the source indentation in
        # the emitted log message.
        test_msg = ('Test : Pearson {0} Spearman {1} MSE {2} '
                    'for SICK Relatedness\n')
        logging.debug(test_msg.format(pr, sr, se))

        return {
            'devpearson': devpr,
            'devspearman': dev_sr,
            'pearson': pr,
            'spearman': sr,
            'mse': se,
            'preds': test_preds,
            'ndev': len(devA),
            'ntest': len(testA)
        }