def run(self, params, batcher):
    """Evaluate MNLI: encode all splits, train a 3-way classifier, report results.

    Args:
        params: evaluation config (batch_size, usepytorch, nhid, classifier, ...).
        batcher: callable ``(params, list_of_sentences) -> np.ndarray`` of
            sentence embeddings, one row per sentence.

    Returns:
        dict with dev accuracy, matched/mismatched test accuracies,
        diagnostic accuracy, and per-split predictions restored to their
        original example order, plus ``ndev``/``ntest`` split sizes.
    """
    self.X, self.y, self.idxs = {}, {}, {}
    for key in self.data:
        self.X.setdefault(key, [])
        self.y.setdefault(key, [])
        self.idxs.setdefault(key, [])

        # Test/diagnostic splits carry original example indices so that
        # predictions can be re-sorted into file order afterwards.
        if "test" in key or "diagnostic" in key:
            input1, input2, mylabels, idxs = self.data[key]
            self.idxs[key] = idxs
        else:
            input1, input2, mylabels = self.data[key]

        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]
            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                # Standard sentence-pair features: [u; v; u*v; |u - v|]
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            # BUGFIX: the original condition
            #   (ii * batch_size) % (20000 * batch_size) == 0
            # reduces to ii % 20000 == 0, which never fires past ii == 0
            # unless batch_size divides 20000.  Fire once per ~20000
            # examples for any batch size instead.
            if ii % 20000 < params.batch_size:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        logging.debug("Finished encoding MNLI")
        self.X[key] = np.vstack(enc_input)
        self.y[key] = mylabels
        del enc_input  # free per-batch encodings before the next split

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': False}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    # NOTE(review): assumes SplitClassifier preserves example order within
    # each split -- confirm, since sort_preds relies on it.
    logging.debug("Built classifier, starting training")
    devacc, testacc, test_preds = clf.run()
    test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])

    mm_acc = round(100 * clf.clf.score(self.X['test_mismatched'],
                                       self.y['test_mismatched']), 2)
    mm_preds = clf.clf.predict(self.X['test_mismatched'])
    mm_preds = sort_preds(mm_preds.squeeze().tolist(),
                          self.idxs['test_mismatched'])

    d_acc = round(100 * clf.clf.score(self.X['diagnostic'],
                                      self.y['diagnostic']), 2)
    d_preds = clf.clf.predict(self.X['diagnostic'])
    d_preds = sort_preds(d_preds.squeeze().tolist(), self.idxs['diagnostic'])

    # BUGFIX: the original format string was split by a raw newline inside
    # the literal, injecting source indentation into the log message; use
    # implicit string concatenation instead.
    logging.debug('Dev acc : {0} Matched test acc : {1} '
                  'Mismatched test acc: {2} for MNLI\n'.format(
                      devacc, testacc, mm_acc))
    return {'devacc': devacc, 'matched_acc': testacc, 'preds': test_preds,
            'mismatched_acc': mm_acc, 'mismatched_preds': mm_preds,
            'diagnostic_acc': d_acc, 'diagnostic_preds': d_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
def run(self, params, batcher):
    """Evaluate SST: embed each split, train a classifier, report accuracies.

    Args:
        params: evaluation config (batch_size, usepytorch, classifier, ...).
        batcher: callable ``(params, list_of_sentences) -> np.ndarray``.

    Returns:
        dict with dev/test accuracy, test predictions restored to original
        order, and ``ndev``/``ntest`` split sizes.
    """
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sst_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort examples by length so batches need less padding.
        if key == 'test':
            # The test split also carries original indices; keep them
            # aligned through the sort for later re-ordering of preds.
            sorted_data = sorted(zip(self.sst_data[key]['X'],
                                     self.sst_data[key]['y'],
                                     self.sst_data[key]['idx']),
                                 key=lambda z: (len(z[0]), z[1], z[2]))
            self.sst_data[key]['X'], self.sst_data[key]['y'], \
                self.sst_data[key]['idx'] = map(list, zip(*sorted_data))
            sst_embed[key]['idx'] = self.sst_data[key]['idx']
        else:
            sorted_data = sorted(zip(self.sst_data[key]['X'],
                                     self.sst_data[key]['y']),
                                 key=lambda z: (len(z[0]), z[1]))
            self.sst_data[key]['X'], self.sst_data[key]['y'] = \
                map(list, zip(*sorted_data))

        sst_embed[key]['X'] = []
        for ii in range(0, len(self.sst_data[key]['y']), bsize):
            batch = self.sst_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            sst_embed[key]['X'].append(embeddings)
        sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
        sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'noreg': False}

    clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                             'valid': sst_embed['dev']['X'],
                             'test': sst_embed['test']['X']},
                          y={'train': sst_embed['train']['y'],
                             'valid': sst_embed['dev']['y'],
                             'test': sst_embed['test']['y']},
                          config=config_classifier)

    devacc, testacc, test_preds = clf.run()
    test_preds = sort_preds(test_preds.squeeze().tolist(),
                            sst_embed['test']['idx'])
    # BUGFIX: the original literal used a backslash line continuation
    # *inside* the string, which injected the next line's indentation
    # whitespace into the log message; implicit concatenation avoids that.
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SST {2} classification\n'.format(devacc, testacc,
                                                    self.task_name))
    return {'devacc': devacc, 'acc': testacc, 'preds': test_preds,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X'])}
def run(self, params, batcher):
    """Evaluate QNLI: encode sentence pairs, train a binary classifier.

    Args:
        params: evaluation config (batch_size, usepytorch, nhid, classifier, ...).
        batcher: callable ``(params, list_of_sentences) -> np.ndarray``.

    Returns:
        dict with dev/test accuracy, test predictions restored to original
        order, and ``ndev``/``ntest`` split sizes.
    """
    self.X, self.y, self.idxs = {}, {}, {}
    for key in self.data:
        self.X.setdefault(key, [])
        self.y.setdefault(key, [])
        self.idxs.setdefault(key, [])

        if key == 'test':
            # Unlabeled test data ships as (s1, s2, idx); labeled test
            # data as (s1, s2, y, idx).  Dummy zero labels keep the
            # encoding loop uniform when gold labels are absent.
            if len(self.data[key]) == 3:
                input1, input2, idxs = self.data[key]
                mylabels = [0] * len(idxs)
            elif len(self.data[key]) == 4:
                input1, input2, mylabels, idxs = self.data[key]
            self.idxs[key] = idxs
        else:
            input1, input2, mylabels = self.data[key]

        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]
            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                # Standard sentence-pair features: [u; v; u*v; |u - v|]
                enc_input.append(
                    np.hstack((enc1, enc2, enc1 * enc2,
                               np.abs(enc1 - enc2))))
            # BUGFIX: the original condition
            #   (ii * batch_size) % (20000 * batch_size) == 0
            # reduces to ii % 20000 == 0, which never fires past ii == 0
            # unless batch_size divides 20000.  Fire once per ~20000
            # examples for any batch size instead.
            if ii % 20000 < params.batch_size:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = mylabels

    config = {
        'nclasses': 2,
        'seed': self.seed,
        'usepytorch': params.usepytorch,
        'cudaEfficient': True,
        'nhid': params.nhid,
        'noreg': False
    }
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, test_preds = clf.run()
    test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])
    logging.debug('Dev acc : {0} Test acc : {1} for QNLI\n'.format(
        devacc, testacc))
    return {
        'devacc': devacc,
        'acc': testacc,
        'preds': test_preds,
        'ndev': len(self.data['valid'][0]),
        'ntest': len(self.data['test'][0])
    }
def run(self, params, batcher):
    """Evaluate CoLA: encode single sentences, train a binary classifier,
    and report accuracy plus Matthews correlation (the official metric).

    Args:
        params: evaluation config (batch_size, usepytorch, nhid, classifier, ...).
        batcher: callable ``(params, list_of_sentences) -> np.ndarray``.

    Returns:
        dict with dev/test accuracy and MCC, test predictions restored to
        original order, and ``ndev``/``ntest`` split sizes.
    """
    self.X, self.y, self.idxs = {}, {}, {}
    for key in self.data:
        self.X.setdefault(key, [])
        self.y.setdefault(key, [])
        self.idxs.setdefault(key, [])

        if key == 'test':
            # Unlabeled test data ships as (s, idx); labeled test data as
            # (s, y, idx).  Dummy zero labels keep the loop uniform.
            if len(self.data[key]) == 2:
                input1, idxs = self.data[key]
                mylabels = [0] * len(idxs)
            elif len(self.data[key]) == 3:
                input1, mylabels, idxs = self.data[key]
            self.idxs[key] = idxs
        else:
            input1, mylabels = self.data[key]

        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            if len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc_input.append(enc1)
            # BUGFIX: the original condition
            #   (ii * batch_size) % (20000 * batch_size) == 0
            # reduces to ii % 20000 == 0, which never fires past ii == 0
            # unless batch_size divides 20000.  Fire once per ~20000
            # examples for any batch size instead.
            if ii % 20000 < params.batch_size:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = mylabels

    config = {
        'nclasses': 2,
        'seed': self.seed,
        'usepytorch': params.usepytorch,
        'cudaEfficient': True,
        'nhid': params.nhid,
        'noreg': False
    }
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, test_preds = clf.run()
    dev_preds = clf.clf.predict(self.X['valid'])
    # MCC is computed on the (length-sorted) split order; both labels and
    # predictions are in that same order, so the pairing is consistent.
    dev_mcc = matthews_corrcoef(self.y['valid'], dev_preds.squeeze())
    test_mcc = matthews_corrcoef(self.y['test'], test_preds.squeeze())
    test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])
    logging.debug(
        'Dev acc : {0} Dev MCC : {3} Test acc : {1} Test MCC : {2} for CoLA\n'
        .format(devacc, testacc, test_mcc, dev_mcc))
    return {
        'devacc': devacc,
        'devmcc': dev_mcc,
        'acc': testacc,
        'mcc': test_mcc,
        'preds': test_preds,
        'ndev': len(self.data['valid'][0]),
        'ntest': len(self.data['test'][0])
    }
def run(self, params, batcher):
    """Evaluate SICK-Relatedness: embed sentence pairs and regress a
    relatedness score, reporting Pearson/Spearman/MSE.

    Args:
        params: evaluation config (batch_size, ...).
        batcher: callable ``(params, list_of_sentences) -> np.ndarray``.

    Returns:
        dict with dev/test Pearson and Spearman correlations, test MSE,
        test predictions restored to original order, and split sizes.
    """
    sick_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sick_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort by sentence lengths so batches need less padding.
        # BUGFIX: the original computed sorted_corpus once here and then
        # unconditionally recomputed it in both branches below -- the
        # first sort was dead work and has been removed.
        if key == 'test':
            # The test split also carries original indices; keep them
            # aligned through the sort for later re-ordering of preds.
            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                       self.sick_data[key]['X_B'],
                                       self.sick_data[key]['y'],
                                       self.sick_data[key]['idx']),
                                   key=lambda z: (len(z[0]), len(z[1]),
                                                  z[2], z[3]))
            (self.sick_data[key]['X_A'], self.sick_data[key]['X_B'],
             self.sick_data[key]['y'], self.sick_data[key]['idx']) = \
                map(list, zip(*sorted_corpus))
            sick_embed[key]['idx'] = self.sick_data[key]['idx']
        else:
            sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                       self.sick_data[key]['X_B'],
                                       self.sick_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]),
                                                  z[2]))
            (self.sick_data[key]['X_A'], self.sick_data[key]['X_B'],
             self.sick_data[key]['y']) = map(list, zip(*sorted_corpus))

        for txt_type in ['X_A', 'X_B']:
            sick_embed[key][txt_type] = []
            for ii in range(0, len(self.sick_data[key]['y']), bsize):
                batch = self.sick_data[key][txt_type][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sick_embed[key][txt_type].append(embeddings)
            sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
        sick_embed[key]['y'] = np.array(self.sick_data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    # Pair features for the relatedness regressor: [|u - v|; u * v]
    # Train
    trainA = sick_embed['train']['X_A']
    trainB = sick_embed['train']['X_B']
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    trainY = self.encode_labels(self.sick_data['train']['y'])
    # Dev
    devA = sick_embed['dev']['X_A']
    devB = sick_embed['dev']['X_B']
    devF = np.c_[np.abs(devA - devB), devA * devB]
    devY = self.encode_labels(self.sick_data['dev']['y'])
    # Test
    testA = sick_embed['test']['X_A']
    testB = sick_embed['test']['X_B']
    testF = np.c_[np.abs(testA - testB), testA * testB]
    testY = self.encode_labels(self.sick_data['test']['y'])

    config = {'seed': self.seed, 'nclasses': 5}
    clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
                             valid={'X': devF, 'y': devY},
                             test={'X': testF, 'y': testY},
                             devscores=self.sick_data['dev']['y'],
                             config=config)

    devpr, yhat, dev_preds = clf.run()

    dev_sr = spearmanr(dev_preds, self.sick_data['dev']['y'])[0]
    pr = pearsonr(yhat, self.sick_data['test']['y'])[0]
    sr = spearmanr(yhat, self.sick_data['test']['y'])[0]
    se = mean_squared_error(yhat, self.sick_data['test']['y'])
    test_preds = sort_preds(yhat.squeeze().tolist(), sick_embed['test']['idx'])
    logging.debug('Dev : Pearson {0} Spearman {1}'.format(devpr, dev_sr))
    # BUGFIX: the original literal used a backslash line continuation
    # *inside* the string, which injected the next line's indentation
    # whitespace into the log message; implicit concatenation avoids that.
    logging.debug('Test : Pearson {0} Spearman {1} MSE {2} '
                  'for SICK Relatedness\n'.format(pr, sr, se))
    return {
        'devpearson': devpr,
        'devspearman': dev_sr,
        'pearson': pr,
        'spearman': sr,
        'mse': se,
        'preds': test_preds,
        'ndev': len(devA),
        'ntest': len(testA)
    }