import copy
import logging
import pickle

import numpy as np
from sklearn.metrics import matthews_corrcoef

# Assumed import: SplitClassifier as shipped with SentEval
# (senteval.tools.validation); some variants below assume extended signatures.
from senteval.tools.validation import SplitClassifier


def run(self, params, batcher):
    self.X, self.y = {}, {}
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels = self.data[key]
        # Encode each full split in one batcher call; the batched loop used
        # by the other variants below is preferable when memory is tight.
        enc1 = batcher(params, input1)
        enc2 = batcher(params, input2)
        self.X[key] = np.hstack((enc1, enc2, np.abs(enc1 - enc2)))
        self.y[key] = [dico_label[y] for y in mylabels]

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': 1000, 'noreg': True}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config_classifier['nhid'] = 1000
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc = clf.run()
    logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
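
# Every run() in this module only assumes that `batcher` maps a list of
# tokenised sentences to a 2-D numpy array of shape (len(batch), dim).
# A minimal sketch of such a batcher, assuming a `params.word_vec` dict of
# word embeddings and a `params.wvec_dim` size (hypothetical attributes,
# mirroring SentEval's bag-of-words example):
def batcher(params, batch):
    embeddings = []
    for sent in batch:
        vecs = [params.word_vec[w] for w in sent if w in params.word_vec]
        if not vecs:  # fall back to a zero vector for empty/OOV sentences
            vecs = [np.zeros(params.wvec_dim)]
        embeddings.append(np.mean(vecs, axis=0))  # mean-pool word vectors
    return np.vstack(embeddings)
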
def run(self, params, batcher):
    sick_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sick_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
                                   self.sick_data[key]['X_B'],
                                   self.sick_data[key]['y']),
                               key=lambda z: (len(z[0]), len(z[1]), z[2]))
        self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
        self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
        self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]

        for txt_type in ['X_A', 'X_B']:
            params.batcher_dataset = f"{key}_{txt_type}"
            sick_embed[key][txt_type] = []
            for ii in range(0, len(self.sick_data[key]['y']), bsize):
                params.batcher_offset = str(ii)
                batch = self.sick_data[key][txt_type][ii:ii + bsize]
                embeddings = batcher(params, batch)
                sick_embed[key][txt_type].append(embeddings)
            sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
        logging.info('Computed {0} embeddings'.format(key))

    # Train
    trainA = sick_embed['train']['X_A']
    trainB = sick_embed['train']['X_B']
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    trainY = np.array(self.sick_data['train']['y'])

    # Dev
    devA = sick_embed['dev']['X_A']
    devB = sick_embed['dev']['X_B']
    devF = np.c_[np.abs(devA - devB), devA * devB]
    devY = np.array(self.sick_data['dev']['y'])

    # Test
    testA = sick_embed['test']['X_A']
    testB = sick_embed['test']['X_B']
    testF = np.c_[np.abs(testA - testB), testA * testB]
    testY = np.array(self.sick_data['test']['y'])

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid}
    clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},
                          y={'train': trainY, 'valid': devY, 'test': testY},
                          config=config)

    devacc, testacc = clf.run()
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SICK entailment\n'.format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(devA), 'ntest': len(testA)}
def run(self, params, batcher):
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sst_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding, remembering the original test order
        zipped_data = sorted(enumerate(zip(self.sst_data[key]['X'],
                                           self.sst_data[key]['y'])),
                             key=lambda z: (len(z[1][0]), z[1][1]))
        if key == 'test':
            sorted_test_indices = [i for (i, z) in zipped_data]
        self.sst_data[key]['X'] = [x for (i, (x, y)) in zipped_data]
        self.sst_data[key]['y'] = [y for (i, (x, y)) in zipped_data]

        sst_embed[key]['X'] = []
        for ii in range(0, len(self.sst_data[key]['y']), bsize):
            batch = self.sst_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            sst_embed[key]['X'].append(embeddings)
        sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
        sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier}

    clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                             'valid': sst_embed['dev']['X'],
                             'test': sst_embed['test']['X']},
                          y={'train': sst_embed['train']['y'],
                             'valid': sst_embed['dev']['y'],
                             'test': sst_embed['test']['y']},
                          config=config_classifier)

    devacc, testacc, yhat_sorted = clf.run()
    # Undo the padding sort so predictions line up with the original test set
    yhat = [None] * len(yhat_sorted)
    for (i, y) in enumerate(yhat_sorted):
        yhat[sorted_test_indices[i]] = y
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SST {2} classification\n'.format(devacc, testacc,
                                                    self.task_name))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X']),
            'yhat': yhat, 'metadata': self.metadata}
def run(self, params, batcher):
    self.X, self.y, self.idxs = {}, {}, {}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []
        if key not in self.idxs:
            self.idxs[key] = []

        if "test" in key or "diagnostic" in key:
            input1, input2, mylabels, idxs = self.data[key]
            self.idxs[key] = idxs
        else:
            input1, input2, mylabels = self.data[key]

        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        logging.debug("Finished encoding MNLI")
        self.X[key] = np.vstack(enc_input)
        self.y[key] = mylabels
        del enc_input

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': False}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    # maybe assert that the order isn't changed
    logging.debug("Built classifier, starting training")
    devacc, testacc, test_preds = clf.run()
    test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])
    mm_acc = round(100 * clf.clf.score(self.X['test_mismatched'],
                                       self.y['test_mismatched']), 2)
    mm_preds = clf.clf.predict(self.X['test_mismatched'])
    mm_preds = sort_preds(mm_preds.squeeze().tolist(),
                          self.idxs['test_mismatched'])
    d_acc = round(100 * clf.clf.score(self.X['diagnostic'],
                                      self.y['diagnostic']), 2)
    d_preds = clf.clf.predict(self.X['diagnostic'])
    d_preds = sort_preds(d_preds.squeeze().tolist(), self.idxs['diagnostic'])
    logging.debug('Dev acc : {0} Matched test acc : {1} '
                  'Mismatched test acc : {2} for MNLI\n'
                  .format(devacc, testacc, mm_acc))
    return {'devacc': devacc, 'matched_acc': testacc, 'preds': test_preds,
            'mismatched_acc': mm_acc, 'mismatched_preds': mm_preds,
            'diagnostic_acc': d_acc, 'diagnostic_preds': d_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
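
# `sort_preds` (used by the MNLI and CoLA runs) is defined elsewhere; a
# minimal sketch consistent with how it is called above, assuming `idxs`
# holds each prediction's position in the original corpus:
def sort_preds(preds, idxs):
    # pair each prediction with its original index, then sort by index
    return [p for (_, p) in sorted(zip(idxs, preds), key=lambda z: z[0])]
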
def run(self, params, batcher):
    self.X, self.y = {}, {}
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels = self.data[key]
        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = [dico_label[y] for y in mylabels]

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': True}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, yhat_sorted = clf.run()
    # Map predictions back to the original (pre-sort) test order
    yhat = [None] * len(yhat_sorted)
    for (i, y) in enumerate(yhat_sorted):
        yhat[self.sorted_test_indices[i]] = y
    logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0]),
            'yhat': yhat, 'metadata': self.metadata}
def run(self, params, batcher):
    task_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size
    logging.info('Computing embeddings for train/dev/test')
    for key in self.task_data:
        # Sort to reduce padding
        sorted_data = sorted(zip(self.task_data[key]['X'],
                                 self.task_data[key]['y']),
                             key=lambda z: (len(z[0]), z[1]))
        self.task_data[key]['X'], self.task_data[key]['y'] = map(
            list, zip(*sorted_data))

        task_embed[key]['X'] = []
        for ii in range(0, len(self.task_data[key]['y']), bsize):
            batch = self.task_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            task_embed[key]['X'].append(embeddings)
        task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
        task_embed[key]['y'] = np.array(self.task_data[key]['y'])
    logging.info('Computed embeddings')

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier,
                         'noreg': params.noreg}

    if self.task == "WordContent" and params.classifier['nhid'] > 0:
        # WordContent is probed with a linear (no hidden layer) classifier
        config_classifier = copy.deepcopy(config_classifier)
        config_classifier['classifier']['nhid'] = 0
        logging.debug('nhid: %s' % params.classifier['nhid'])

    clf = SplitClassifier(X={'train': task_embed['train']['X'],
                             'valid': task_embed['dev']['X'],
                             'test': task_embed['test']['X']},
                          y={'train': task_embed['train']['y'],
                             'valid': task_embed['dev']['y'],
                             'test': task_embed['test']['y']},
                          config=config_classifier)

    devacc, testacc = clf.run()
    logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n'
                  % (devacc, testacc, self.task.upper()))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(task_embed['dev']['X']),
            'ntest': len(task_embed['test']['X'])}
def run(self, params, batcher):
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sst_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        sorted_data = sorted(zip(self.sst_data[key]['X'],
                                 self.sst_data[key]['y']),
                             key=lambda z: (len(z[0]), z[1]))
        self.sst_data[key]['X'], self.sst_data[key]['y'] = map(
            list, zip(*sorted_data))

        sst_embed[key]['X'] = []
        for ii in range(0, len(self.sst_data[key]['y']), bsize):
            batch = self.sst_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            sst_embed[key]['X'].append(embeddings)
        sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
        sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier}

    clf = SplitClassifier(X={'train': sst_embed['train']['X'],
                             'valid': sst_embed['dev']['X'],
                             'test': sst_embed['test']['X']},
                          y={'train': sst_embed['train']['y'],
                             'valid': sst_embed['dev']['y'],
                             'test': sst_embed['test']['y']},
                          config=config_classifier)

    # This SplitClassifier variant also returns the fitted classifier
    classifier, devacc, testacc = clf.run()
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SST {2} classification\n'.format(devacc, testacc,
                                                    self.task_name))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(sst_embed['dev']['X']),
            'ntest': len(sst_embed['test']['X']),
            'classifier': classifier,
            'X_train': sst_embed['train']['X'],
            'X_test': sst_embed['test']['X'],
            'text': self.sst_data['train']['X']}
def run(self, params, batcher): sst_embed = {"train": {}, "dev": {}, "test": {}} bsize = params.batch_size for key in self.sst_data: logging.info("Computing embedding for {0}".format(key)) # Sort to reduce padding sorted_data = sorted( zip(self.sst_data[key]["X"], self.sst_data[key]["y"]), key=lambda z: (len(z[0]), z[1]), ) self.sst_data[key]["X"], self.sst_data[key]["y"] = map( list, zip(*sorted_data)) sst_embed[key]["X"] = [] for ii in range(0, len(self.sst_data[key]["y"]), bsize): batch = self.sst_data[key]["X"][ii:ii + bsize] embeddings = batcher(params, batch) sst_embed[key]["X"].append(embeddings) sst_embed[key]["X"] = np.vstack(sst_embed[key]["X"]) sst_embed[key]["y"] = np.array(self.sst_data[key]["y"]) logging.info("Computed {0} embeddings".format(key)) config_classifier = { "nclasses": self.nclasses, "seed": self.seed, "usepytorch": params.usepytorch, "classifier": params.classifier, } clf = SplitClassifier( X={ "train": sst_embed["train"]["X"], "valid": sst_embed["dev"]["X"], "test": sst_embed["test"]["X"], }, y={ "train": sst_embed["train"]["y"], "valid": sst_embed["dev"]["y"], "test": sst_embed["test"]["y"], }, config=config_classifier, ) devacc, testacc = clf.run() devacc = devacc.tolist() logging.debug("\nDev acc : {0} Test acc : {1} for \ SST {2} classification\n".format(devacc, testacc, self.task_name)) return { "devacc": devacc, "acc": testacc, "ndev": len(sst_embed["dev"]["X"]), "ntest": len(sst_embed["test"]["X"]), }
def run(self, params, batcher):
    self.X, self.y = {}, {}
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels = self.data[key]
        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = [dico_label[y] for y in mylabels]

    config_classifier = {'nclasses': 3, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'cudaEfficient': True,
                         'classifier': params.classifier,
                         'nhid': params.nhid,
                         'maxepoch': 40, 'nepoches': 4, 'noreg': False}

    clf = SplitClassifier(self.X, self.y, config_classifier)
    devacc, testacc = clf.run()
    logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
def run(self, params, batcher):
    self.X, self.y = {}, {}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels = self.data[key]
        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        # ANLI labels are already integer-encoded, so no label mapping is
        # needed here (unlike the SNLI variants above)
        self.y[key] = mylabels

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': True}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, test_preds = clf.run()
    logging.debug('Dev acc : {0} Test acc : {1} for ANLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc, 'preds': test_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
def run(self, params, batcher):
    self.X, self.y = {}, {}
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels, ids = self.data[key]
        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (200 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        try:
            self.y[key] = [dico_label[y] for y in mylabels]
        except KeyError:
            logging.info('key error')
            continue

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': True}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, preds, probs = clf.run()

    # Group the flat probability list into one (p_ent, p_neu, p_con) triple
    # per test example (the original hard-coded `while i < 52` loop)
    pp = [probs[i:i + 3] for i in range(0, len(probs), 3)]
    inv_label = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
    predictions = [inv_label[p[0]] for p in preds]
    for i, j, k, l in zip(pp, predictions,
                          self.data['test'][0], self.data['test'][1]):
        print(" ".join(k), "\t", " ".join(l), "\t", j, "\t", i)
    logging.debug('Dev acc : {0} Test acc : {1} for MedNLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
def run(self, params, batcher):
    self.X, self.y, self.idxs = {}, {}, {}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []
        if key not in self.idxs:
            self.idxs[key] = []

        if key == 'test':
            if len(self.data[key]) == 2:
                # unlabeled test set: (sentences, indices)
                input1, idxs = self.data[key]
                mylabels = [0] * len(idxs)
            elif len(self.data[key]) == 3:
                input1, mylabels, idxs = self.data[key]
            self.idxs[key] = idxs
        else:
            input1, mylabels = self.data[key]

        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]

            if len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc_input.append(enc1)
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = mylabels

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': False}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(self.X, self.y, config)
    devacc, testacc, test_preds = clf.run()
    dev_preds = clf.clf.predict(self.X['valid'])
    dev_mcc = matthews_corrcoef(self.y['valid'], dev_preds.squeeze())
    test_mcc = matthews_corrcoef(self.y['test'], test_preds.squeeze())
    test_preds = sort_preds(test_preds.squeeze().tolist(), self.idxs['test'])
    logging.debug('Dev acc : {0} Dev MCC : {3} '
                  'Test acc : {1} Test MCC : {2} for CoLA\n'
                  .format(devacc, testacc, test_mcc, dev_mcc))
    return {'devacc': devacc, 'devmcc': dev_mcc, 'acc': testacc,
            'mcc': test_mcc, 'preds': test_preds,
            'ndev': len(self.data['valid'][0]),
            'ntest': len(self.data['test'][0])}
def run(self, params, batcher):
    # Always allocate all three splits: the train embeddings are loaded
    # below even when params.train is False (only training is skipped)
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}

    train_file_x = 'embeddings/trainx_' + params.model_name + "_sst.csv"
    train_file_y = 'embeddings/trainy_' + params.model_name + "_sst.csv"
    test_file_x = 'embeddings/testx_' + params.model_name + "_sst.csv"
    test_file_y = 'embeddings/testy_' + params.model_name + "_sst.csv"
    dev_file_x = 'embeddings/devx_' + params.model_name + "_sst.csv"
    dev_file_y = 'embeddings/devy_' + params.model_name + "_sst.csv"

    self.params = params
    self.adversarialFunc = params.adversarialFunc

    # Embeddings are precomputed and pickled; load them instead of
    # re-encoding the corpus with `batcher`
    sst_embed['train']['X'] = pickle.load(open(train_file_x, 'rb'))
    sst_embed['train']['y'] = pickle.load(open(train_file_y, 'rb'))
    sst_embed['test']['X'] = pickle.load(open(test_file_x, 'rb'))
    sst_embed['test']['y'] = pickle.load(open(test_file_y, 'rb'))
    sst_embed['dev']['X'] = pickle.load(open(dev_file_x, 'rb'))
    sst_embed['dev']['y'] = pickle.load(open(dev_file_y, 'rb'))
    logging.info("loaded sst embeddings.")

    config_classifier = {
        'nclasses': self.nclasses, 'seed': self.seed,
        'usepytorch': params.usepytorch,
        'classifier': params.classifier,
        'adversarial_sample_generator': self.generate_adv_samples
        if self.adversarialFunc is not None else None}

    if params.train is not None and not params.train:
        X = {'train': {},
             'valid': sst_embed['dev']['X'],
             'test': sst_embed['test']['X']}
        y = {'train': {},
             'valid': sst_embed['dev']['y'],
             'test': sst_embed['test']['y']}
    else:
        X = {'train': sst_embed['train']['X'],
             'valid': sst_embed['dev']['X'],
             'test': sst_embed['test']['X']}
        y = {'train': sst_embed['train']['y'],
             'valid': sst_embed['dev']['y'],
             'test': sst_embed['test']['y']}

    clf = SplitClassifier(X, y, config=config_classifier,
                          test_dataX=self.sst_data['test']['X'],
                          test_dataY=self.sst_data['test']['y'])
    params.task_name = "sst"
    devacc, testacc, adv_results = clf.run(params)
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SST {2} classification\n'.format(devacc, testacc,
                                                    self.task_name))
    results = dict()
    results['task_results'] = {'devacc': devacc, 'acc': testacc,
                               'ndev': len(sst_embed['dev']['X']),
                               'ntest': len(sst_embed['test']['X'])}
    results['adv_results'] = adv_results
    return results
def run(self, params, batcher):
    sst_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size

    for key in self.sst_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        sorted_data = sorted(zip(self.sst_data[key]['X'],
                                 self.sst_data[key]['y']),
                             key=lambda z: (len(z[0]), z[1]))
        self.sst_data[key]['X'], self.sst_data[key]['y'] = map(
            list, zip(*sorted_data))

        sst_embed[key]['X'] = []
        for ii in range(0, len(self.sst_data[key]['y']), bsize):
            batch = self.sst_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            sst_embed[key]['X'].append(embeddings)
        sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
        sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
        logging.info('Computed {0} embeddings'.format(key))

    dev_length = len(sst_embed['dev']['X'])
    test_length = len(sst_embed['test']['X'])

    # Stack all embeddings into one table and keep per-split row indices,
    # freeing the per-split copies as we go
    embeddings = []
    index = 0

    trainX_indexes = []
    for embedding in sst_embed['train']['X']:
        embeddings.append(embedding)
        trainX_indexes.append(index)
        index += 1
    trainX_indexes = np.vstack(trainX_indexes)
    del sst_embed['train']['X']
    trainy_indexes = sst_embed['train']['y']
    del sst_embed['train']['y']

    devX_indexes = []
    for embedding in sst_embed['dev']['X']:
        embeddings.append(embedding)
        devX_indexes.append(index)
        index += 1
    devX_indexes = np.vstack(devX_indexes)
    del sst_embed['dev']['X']
    devy_indexes = sst_embed['dev']['y']
    del sst_embed['dev']['y']

    testX_indexes = []
    for embedding in sst_embed['test']['X']:
        embeddings.append(embedding)
        testX_indexes.append(index)
        index += 1
    testX_indexes = np.vstack(testX_indexes)
    del sst_embed['test']['X']
    testy_indexes = sst_embed['test']['y']
    del sst_embed['test']['y']

    sst_embed = None
    embeddings = np.vstack(embeddings)

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier}

    clf = SplitClassifier(X={'train': trainX_indexes,
                             'valid': devX_indexes,
                             'test': testX_indexes},
                          y={'train': trainy_indexes,
                             'valid': devy_indexes,
                             'test': testy_indexes},
                          embeddings=embeddings,
                          config=config_classifier)

    devacc, testacc = clf.run()
    logging.debug('\nDev acc : {0} Test acc : {1} for '
                  'SST {2} classification\n'.format(devacc, testacc,
                                                    self.task_name))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': dev_length, 'ntest': test_length}
def run(self, params, batcher): task_embed = {"train": {}, "dev": {}, "test": {}} bsize = params.batch_size logging.info("Computing embeddings for train/dev/test") for key in self.task_data: # Sort to reduce padding sorted_data = sorted( zip( self.task_data[key]["X"], self.task_data[key]["id"], self.task_data[key]["y"], self.task_data[key]["head"], self.task_data[key]["tail"], self.task_data[key]["ner"], self.task_data[key]["pos"], self.task_data[key]["dep"], self.task_data[key]["dep_head"], ), key=lambda z: (len(z[0]), z[1]), ) ( self.task_data[key]["X"], self.task_data[key]["id"], self.task_data[key]["y"], self.task_data[key]["head"], self.task_data[key]["tail"], self.task_data[key]["ner"], self.task_data[key]["pos"], self.task_data[key]["dep"], self.task_data[key]["dep_head"], ) = map(list, zip(*sorted_data)) task_embed[key]["X"] = [] for ii in range(0, len(self.task_data[key]["y"]), bsize): batch = self.task_data[key]["X"][ii:ii + bsize] id_ = self.task_data[key]["id"][ii:ii + bsize] id_ = id_ if id_ != "None" else None head = self.task_data[key]["head"][ii:ii + bsize] tail = self.task_data[key]["tail"][ii:ii + bsize] ner = self.task_data[key]["ner"][ii:ii + bsize] pos = self.task_data[key]["pos"][ii:ii + bsize] dep = self.task_data[key]["dep"][ii:ii + bsize] dep_head = self.task_data[key]["dep_head"][ii:ii + bsize] embeddings = batcher(params, batch, head, tail, ner, pos, dep, dep_head, id_) task_embed[key]["X"].append(embeddings) task_embed[key]["X"] = np.vstack(task_embed[key]["X"]) task_embed[key]["y"] = np.array(self.task_data[key]["y"]) logging.info("Computed embeddings") config_classifier = { "nclasses": self.nclasses, "seed": self.seed, "usepytorch": params.usepytorch, "classifier": params.classifier, } # if self.task == "WordContent" and params.classifier["nhid"] > 0: # config_classifier = copy.deepcopy(config_classifier) # config_classifier["classifier"]["nhid"] = 0 # print(params.classifier["nhid"]) clf = SplitClassifier( X={ "train": task_embed["train"]["X"], "valid": task_embed["dev"]["X"], "test": task_embed["test"]["X"], }, y={ "train": task_embed["train"]["y"], "valid": task_embed["dev"]["y"], "test": task_embed["test"]["y"], }, config=config_classifier, ) devacc, testacc = clf.run() logging.debug( "\nDev acc : %.1f Test acc : %.1f for %s classification\n" % (devacc, testacc, self.task.upper())) return { "devacc": devacc, "acc": testacc, "ndev": len(task_embed["dev"]["X"]), "ntest": len(task_embed["test"]["X"]), }
def run(self, params, batcher):
    rqe_embed = {'dev': {}, 'train': {}, 'test': {}}

    for key in self.rqe_data:
        logging.info('Computing embedding for {0}'.format(key))
        # Sort to reduce padding
        text_data = {}
        sorted_corpus = sorted(zip(self.rqe_data[key]['chq'],
                                   self.rqe_data[key]['faq'],
                                   self.rqe_data[key]['label'],
                                   self.rqe_data[key]['pid']),
                               key=lambda z: (len(z[0]), len(z[1]), z[2]))
        text_data['chq'] = [x for (x, y, z, w) in sorted_corpus]
        text_data['faq'] = [y for (x, y, z, w) in sorted_corpus]
        text_data['label'] = [z for (x, y, z, w) in sorted_corpus]
        text_data['pid'] = [w for (x, y, z, w) in sorted_corpus]

        for txt_type in ['chq', 'faq']:
            rqe_embed[key][txt_type] = []
            for ii in range(0, len(text_data['label']), params.batch_size):
                batch = text_data[txt_type][ii:ii + params.batch_size]
                embeddings = batcher(params, batch)
                rqe_embed[key][txt_type].append(embeddings)
            rqe_embed[key][txt_type] = np.vstack(rqe_embed[key][txt_type])
        rqe_embed[key]['label'] = np.array(text_data['label'])
        logging.info('Computed {0} embeddings'.format(key))

    # Train
    trainC = rqe_embed['train']['chq']
    trainF = rqe_embed['train']['faq']
    trainCF = np.hstack((trainC, trainF, trainC * trainF,
                         np.abs(trainC - trainF)))
    trainY = rqe_embed['train']['label']
    logging.info('Done embedding for train')

    # Test
    testC = rqe_embed['test']['chq']
    testF = rqe_embed['test']['faq']
    testCF = np.hstack((testC, testF, testC * testF,
                        np.abs(testC - testF)))
    testY = rqe_embed['test']['label']
    logging.info('Done embedding for test')

    # Dev
    devC = rqe_embed['dev']['chq']
    devF = rqe_embed['dev']['faq']
    devCF = np.hstack((devC, devF, devC * devF, np.abs(devC - devF)))
    devY = rqe_embed['dev']['label']
    logging.info('Done embedding for dev')

    config = {'nclasses': 2, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'classifier': params.classifier,
              'nhid': params.nhid, 'kfold': params.kfold}

    clf = SplitClassifier(X={'train': trainCF, 'valid': devCF,
                             'test': testCF},
                          y={'train': trainY, 'valid': devY,
                             'test': testY},
                          config=config)

    devacc, testacc, yhat = clf.run()
    logging.debug('pids: {0}'.format(text_data['pid']))
    logging.debug('predictions: {0}'.format(list(yhat)))
    logging.debug('Dev acc : {0} Test acc {1}; for RQE.\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(devCF),  # dev-set size
            'ntest': len(testCF)}
def run(self, params, batcher):
    task_embed = {'train': {}, 'dev': {}, 'test': {}}
    bsize = params.batch_size
    logging.info('Computing embeddings for train/dev/test')
    for key in self.task_data:
        # Sort to reduce padding, remembering the original test order
        zipped_data = sorted(enumerate(zip(self.task_data[key]['X'],
                                           self.task_data[key]['y'])),
                             key=lambda z: (len(z[1][0]), z[1][1]))
        if key == 'test':
            sorted_test_indices = [i for (i, z) in zipped_data]
        self.task_data[key]['X'] = [x for (i, (x, y)) in zipped_data]
        self.task_data[key]['y'] = [y for (i, (x, y)) in zipped_data]

        task_embed[key]['X'] = []
        for ii in range(0, len(self.task_data[key]['y']), bsize):
            batch = self.task_data[key]['X'][ii:ii + bsize]
            embeddings = batcher(params, batch)
            task_embed[key]['X'].append(embeddings)
        task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
        task_embed[key]['y'] = np.array(self.task_data[key]['y'])
    logging.info('Computed embeddings')

    config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
                         'usepytorch': params.usepytorch,
                         'classifier': params.classifier}

    if self.task == "WordContent" and params.classifier['nhid'] > 0:
        # WordContent is probed with a linear (no hidden layer) classifier
        config_classifier = copy.deepcopy(config_classifier)
        config_classifier['classifier']['nhid'] = 0
        logging.debug('nhid: %s' % params.classifier['nhid'])

    clf = SplitClassifier(X={'train': task_embed['train']['X'],
                             'valid': task_embed['dev']['X'],
                             'test': task_embed['test']['X']},
                          y={'train': task_embed['train']['y'],
                             'valid': task_embed['dev']['y'],
                             'test': task_embed['test']['y']},
                          config=config_classifier)

    devacc, testacc, yhat_sorted = clf.run()
    # Undo the padding sort on the test predictions
    yhat = [None] * len(yhat_sorted)
    for (i, y) in enumerate(yhat_sorted):
        yhat[sorted_test_indices[i]] = y
    logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n'
                  % (devacc, testacc, self.task.upper()))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(task_embed['dev']['X']),
            'ntest': len(task_embed['test']['X']),
            'metadata': self.metadata, 'yhat': yhat}
def run(self, params, batcher):
    train_embeddings, valid_embeddings, test_embeddings = [], [], []

    # Sort to reduce padding
    sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']),
                                 key=lambda z: (len(z[0]), z[1]))
    train_samples = [x for (x, y) in sorted_corpus_train]
    train_labels = [y for (x, y) in sorted_corpus_train]

    sorted_corpus_valid = sorted(zip(self.valid['X'], self.valid['y']),
                                 key=lambda z: (len(z[0]), z[1]))
    valid_samples = [x for (x, y) in sorted_corpus_valid]
    valid_labels = [y for (x, y) in sorted_corpus_valid]

    sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']),
                                key=lambda z: (len(z[0]), z[1]))
    test_samples = [x for (x, y) in sorted_corpus_test]
    test_labels = [y for (x, y) in sorted_corpus_test]

    # Get train embeddings
    for ii in range(0, len(train_labels), params.batch_size):
        batch = train_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        train_embeddings.append(embeddings)
    train_embeddings = np.vstack(train_embeddings)
    logging.info('Computed train embeddings')

    # Get validation embeddings
    for ii in range(0, len(valid_labels), params.batch_size):
        batch = valid_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        valid_embeddings.append(embeddings)
    valid_embeddings = np.vstack(valid_embeddings)
    logging.info('Computed validation embeddings')

    # Get test embeddings
    for ii in range(0, len(test_labels), params.batch_size):
        batch = test_samples[ii:ii + params.batch_size]
        embeddings = batcher(params, batch)
        test_embeddings.append(embeddings)
    test_embeddings = np.vstack(test_embeddings)
    logging.info('Computed test embeddings')

    config = {'nclasses': 7, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': True}
    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(X={'train': train_embeddings,
                             'valid': valid_embeddings,
                             'test': test_embeddings},
                          y={'train': train_labels,
                             'valid': valid_labels,
                             'test': test_labels},
                          config=config)

    devacc, testacc = clf.run()
    logging.debug('Dev acc : {0} Test acc : {1} for PICO\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc,
            'ndev': len(self.valid['X']), 'ntest': len(self.test['X'])}
def run(self, params, batcher):
    self.X, self.y = {}, {}
    dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    for key in self.data:
        if key not in self.X:
            self.X[key] = []
        if key not in self.y:
            self.y[key] = []

        input1, input2, mylabels = self.data[key]
        enc_input = []
        n_labels = len(mylabels)
        for ii in range(0, n_labels, params.batch_size):
            batch1 = input1[ii:ii + params.batch_size]
            batch2 = input2[ii:ii + params.batch_size]

            if len(batch1) == len(batch2) and len(batch1) > 0:
                enc1 = batcher(params, batch1)
                enc2 = batcher(params, batch2)
                enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
                                            np.abs(enc1 - enc2))))
            if (ii * params.batch_size) % (20000 * params.batch_size) == 0:
                logging.info("PROGRESS (encoding): %.2f%%" %
                             (100 * ii / n_labels))
        self.X[key] = np.vstack(enc_input)
        self.y[key] = [dico_label[y] for y in mylabels]

    ndev = len(self.data['valid'][0])
    ntest = len(self.data['test'][0])
    self.data = None  # free the raw corpus before training

    config = {'nclasses': 3, 'seed': self.seed,
              'usepytorch': params.usepytorch,
              'cudaEfficient': True,
              'nhid': params.nhid, 'noreg': True}

    # Stack all features into one table and keep per-split row indices
    # to reduce peak memory
    embeddings = []
    index = 0
    X_indexes = dict()
    for key in ['train', 'valid', 'test']:
        X_indexes[key] = []
        for embedding in self.X[key]:
            embeddings.append(embedding)
            X_indexes[key].append(index)
            index += 1
        X_indexes[key] = np.vstack(X_indexes[key])
        del self.X[key]
    self.X = None
    embeddings = np.vstack(embeddings)

    config_classifier = copy.deepcopy(params.classifier)
    config_classifier['max_epoch'] = 15
    config_classifier['epoch_size'] = 1
    config['classifier'] = config_classifier

    clf = SplitClassifier(X_indexes, self.y, embeddings, config)
    devacc, testacc = clf.run()
    logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
                  .format(devacc, testacc))
    return {'devacc': devacc, 'acc': testacc, 'ndev': ndev, 'ntest': ntest}
def run(self, params, batcher): sick_embed = {"train": {}, "dev": {}, "test": {}} bsize = params.batch_size for key in self.sick_data: logging.info("Computing embedding for {0}".format(key)) # Sort to reduce padding sorted_corpus = sorted( zip( self.sick_data[key]["X_A"], self.sick_data[key]["X_B"], self.sick_data[key]["y"], ), key=lambda z: (len(z[0]), len(z[1]), z[2]), ) self.sick_data[key]["X_A"] = [x for (x, y, z) in sorted_corpus] self.sick_data[key]["X_B"] = [y for (x, y, z) in sorted_corpus] self.sick_data[key]["y"] = [z for (x, y, z) in sorted_corpus] for txt_type in ["X_A", "X_B"]: sick_embed[key][txt_type] = [] for ii in range(0, len(self.sick_data[key]["y"]), bsize): batch = self.sick_data[key][txt_type][ii:ii + bsize] embeddings = batcher(params, batch) sick_embed[key][txt_type].append(embeddings) sick_embed[key][txt_type] = np.vstack( sick_embed[key][txt_type]) logging.info("Computed {0} embeddings".format(key)) # Train trainA = sick_embed["train"]["X_A"] trainB = sick_embed["train"]["X_B"] trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] trainY = np.array(self.sick_data["train"]["y"]) # Dev devA = sick_embed["dev"]["X_A"] devB = sick_embed["dev"]["X_B"] devF = np.c_[np.abs(devA - devB), devA * devB] devY = np.array(self.sick_data["dev"]["y"]) # Test testA = sick_embed["test"]["X_A"] testB = sick_embed["test"]["X_B"] testF = np.c_[np.abs(testA - testB), testA * testB] testY = np.array(self.sick_data["test"]["y"]) config = { "nclasses": 3, "seed": self.seed, "usepytorch": params.usepytorch, "classifier": params.classifier, "nhid": params.nhid, } clf = SplitClassifier( X={ "train": trainF, "valid": devF, "test": testF }, y={ "train": trainY, "valid": devY, "test": testY }, config=config, ) devacc, testacc = clf.run() devacc = devacc.tolist() logging.debug("\nDev acc : {0} Test acc : {1} for \ SICK entailment\n".format(devacc, testacc)) return { "devacc": devacc, "acc": testacc, "ndev": len(devA), "ntest": len(testA), }