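# Imports assumed by the scripts below (a sketch, not taken verbatim from the
# repository): the standard-library and third-party modules are the ones the
# code visibly uses. Project-specific names such as WikiqaBaselineReader,
# WikiqaReader, BaselineCategoryClassifier, BaselineFocusClassifier,
# BaselineAnswerSelectionClassifier, SelfAttentionCnnClassifier, NoamOpt,
# get_label_score, train_model, filtered_ref_generator and
# load_full_embedding_with_vocab are expected to come from this project's own
# modules, whose import paths are not shown in this snippet.
import json
import os
import pickle

import numpy as np
import sklearn.metrics
import torch
import torch.optim as optim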
def main(config_path):
    with open(config_path, 'r') as fread:
        config_dict = json.load(fread)

    # path
    path_config = config_dict['Path']
    model_dir = path_config['model_dir']
    train = path_config['train']
    dev = path_config['dev']
    dev_ref = path_config['dev_ref']
    test = path_config['test']
    test_ref = path_config['test_ref']
    test_result = path_config['test_result']

    print('Loading question analysis models...')
    category_model = BaselineCategoryClassifier.load(
        path_config['category_model_config'])
    focus_model = BaselineFocusClassifier.load(
        path_config['focus_model_config'])
    words_embed, words_vocab = load_full_embedding_with_vocab(
        path_config['embed_dir'])
    with open(path_config['category_vocab'], 'rb') as fread:
        category_vocab = pickle.load(fread)

    # dataset
    dataset_config = config_dict['Dataset']
    pad_size = dataset_config['pad_size']
    batch_size = dataset_config['batch_size']

    print('Loading train data...')
    train_reader = WikiqaBaselineReader(train, category_model, focus_model,
                                        words_vocab.stoi, category_vocab.itos,
                                        PAD_TOKEN='<pad>', pad_size=pad_size)
    dev_reader = WikiqaBaselineReader(dev, category_model, focus_model,
                                      words_vocab.stoi, category_vocab.itos,
                                      PAD_TOKEN='<pad>', pad_size=pad_size)
    vocabs = {'q_words': words_vocab, 'a_words': words_vocab}
    train_reader.set_vocabs(vocabs)
    dev_reader.set_vocabs(vocabs)
    train_iterator = train_reader.get_dataset_iterator(batch_size, train=True)
    dev_iterator = dev_reader.get_dataset_iterator(batch_size, train=False,
                                                   sort=False)

    # model
    model_config = config_dict['Model']
    conv_width = model_config['conv_width']
    out_channels = model_config['out_channels']
    hidden_size = model_config['hidden_size']
    cuda_device = model_config['cuda_device']
    clf = BaselineAnswerSelectionClassifier(words_embed=words_embed,
                                            out_channels=out_channels,
                                            conv_width=conv_width,
                                            hidden_size=hidden_size,
                                            cuda_device=cuda_device)

    # train
    train_config = config_dict['Train']
    num_epoch = train_config['epoch']
    weight_decay = train_config['weight_decay']
    lr = train_config['lr']
    early_stopping = train_config['early_stopping']
    input_names = [
        'q_words', 'a_words', 'q_word_over', 'a_word_over', 'q_sem_over',
        'a_sem_over'
    ]
    optimizer = optim.Adam(clf.parameters(), lr=lr,
                           weight_decay=weight_decay, eps=1e-5)
    if cuda_device is not None:
        clf.cuda(device=cuda_device)

    def callback(verbose=True):
        # Evaluate on the training set.
        train_labels, train_scores = get_label_score(clf, train_iterator,
                                                     cuda_device, 'label',
                                                     input_names=input_names)
        train_predicts = train_scores.argmax(axis=-1)
        train_scores = train_scores[:, 1]
        if verbose:
            print('train_acc: %.2f' %
                  sklearn.metrics.accuracy_score(train_labels, train_predicts))
            print('train_precision: %.2f' %
                  sklearn.metrics.precision_score(train_labels, train_predicts))
            print('train_average_precision: %.2f' %
                  sklearn.metrics.average_precision_score(train_labels,
                                                          train_scores))

        # Evaluate on the dev set.
        dev_labels, dev_scores = get_label_score(clf, dev_iterator,
                                                 cuda_device, 'label',
                                                 input_names=input_names)
        dev_predicts = dev_scores.argmax(axis=-1)
        dev_scores = dev_scores[:, 1]
        if verbose:
            print('dev_acc: %.2f' %
                  sklearn.metrics.accuracy_score(dev_labels, dev_predicts))
            print('dev_precision: %.2f' %
                  sklearn.metrics.precision_score(dev_labels, dev_predicts))
            print('dev_average_precision: %.2f' %
                  sklearn.metrics.average_precision_score(dev_labels,
                                                          dev_scores))

        # Per-query ranking metrics on the dev set.
        index = 0
        aps = []  # for mean average precision score
        rrs = []  # for mean reciprocal rank score
        for query_labels in filtered_ref_generator(dev_ref):
            query_scores = dev_scores[index:index + len(query_labels)]
            index += len(query_labels)
            aps.append(
                sklearn.metrics.average_precision_score(query_labels,
                                                        query_scores))
            # Reciprocal rank: rank of the highest-scoring relevant answer
            # (labels are 0/1, scores come from the positive-class column).
            query_rel_best = np.argmin(-query_scores * query_labels)
            rrs.append(
                1 / (np.argsort(np.argsort(-query_scores))[query_rel_best] + 1))
        if verbose:
            print('dev_MAP: %.2f' % np.mean(aps))
            print('dev_MRR: %.2f' % np.mean(rrs))
        return np.mean(aps)

    print('Training...')
    best_state_dict = train_model(clf, optimizer, train_iterator,
                                  label_name='label', num_epoch=num_epoch,
                                  cuda_device=cuda_device,
                                  early_stopping=early_stopping,
                                  input_names=input_names, callback=callback)
    print()
    if best_state_dict is not None:
        clf.load_state_dict(best_state_dict)
    torch.save(clf.state_dict(), os.path.join(model_dir, 'net.pt'))

    # test
    print('Loading test data...')
    test_reader = WikiqaBaselineReader(test, category_model, focus_model,
                                       words_vocab.stoi, category_vocab.itos,
                                       PAD_TOKEN='<pad>', pad_size=pad_size)
    test_reader.set_vocabs(vocabs)
    test_iterator = test_reader.get_dataset_iterator(batch_size, train=False,
                                                     sort=False)

    print('Testing...')
    test_labels, test_scores = get_label_score(clf, test_iterator, cuda_device,
                                               'label', input_names=input_names)
    test_predicts = test_scores.argmax(axis=-1)
    test_scores = test_scores[:, 1]
    print('test_acc: %.2f' %
          sklearn.metrics.accuracy_score(test_labels, test_predicts))
    print('test_precision: %.2f' %
          sklearn.metrics.precision_score(test_labels, test_predicts))
    print('test_average_precision: %.2f' %
          sklearn.metrics.average_precision_score(test_labels, test_scores))

    # Per-query ranking metrics on the test set.
    index = 0
    aps = []  # for mean average precision score
    rrs = []  # for mean reciprocal rank score
    for query_labels in filtered_ref_generator(test_ref):
        query_scores = test_scores[index:index + len(query_labels)]
        index += len(query_labels)
        aps.append(
            sklearn.metrics.average_precision_score(query_labels, query_scores))
        query_rel_best = np.argmin(-query_scores * query_labels)
        rrs.append(
            1 / (np.argsort(np.argsort(-query_scores))[query_rel_best] + 1))
    print('test_MAP: %.2f' % np.mean(aps))
    print('test_MRR: %.2f' % np.mean(rrs))
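# The script above reads all of its settings from a JSON config file. The
# skeleton below is a sketch reconstructed from the keys the script actually
# accesses; the values are placeholders, not the settings used in the
# repository.
#
# {
#     "Path": {
#         "model_dir": "...", "train": "...", "dev": "...", "dev_ref": "...",
#         "test": "...", "test_ref": "...", "test_result": "...",
#         "category_model_config": "...", "focus_model_config": "...",
#         "embed_dir": "...", "category_vocab": "..."
#     },
#     "Dataset": {"pad_size": 40, "batch_size": 64},
#     "Model": {"conv_width": 5, "out_channels": 100, "hidden_size": 200,
#               "cuda_device": 0},
#     "Train": {"epoch": 10, "weight_decay": 0.0001, "lr": 0.001,
#               "early_stopping": 3}
# }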
def main(config_path):
    with open(config_path, 'r') as fread:
        config_dict = json.load(fread)

    # path
    path_config = config_dict['Path']
    model_dir = path_config['model_dir']
    train = path_config['train']
    dev = path_config['dev']
    dev_ref = path_config['dev_ref']
    test = path_config['test']
    test_ref = path_config['test_ref']

    # dataset
    dataset_config = config_dict['Dataset']
    batch_size = dataset_config['batch_size']

    print('Loading train data...')
    train_reader = WikiqaReader(train, PAD_TOKEN='<pad>')
    dev_reader = WikiqaReader(dev, PAD_TOKEN='<pad>')
    words_embed, words_vocab = load_full_embedding_with_vocab(
        path_config['embed_dir'])
    vocabs = {'q_words': words_vocab, 'a_words': words_vocab}
    train_reader.set_vocabs(vocabs)
    dev_reader.set_vocabs(vocabs)
    train_iterator = train_reader.get_dataset_iterator(batch_size, train=True)
    dev_iterator = dev_reader.get_dataset_iterator(batch_size, train=False,
                                                   sort=False)
    test_reader = WikiqaReader(test, PAD_TOKEN='<pad>')
    test_reader.set_vocabs(vocabs)
    test_iterator = test_reader.get_dataset_iterator(batch_size, train=False,
                                                     sort=False)

    # model
    model_config = config_dict['Model']
    conv_width = model_config['conv_width']
    out_channels = model_config['out_channels']
    hidden_size = model_config['hidden_size']
    cuda_device = model_config['cuda_device']
    dropout = model_config['dropout']
    h = model_config['h']
    clf = SelfAttentionCnnClassifier(words_embed=words_embed,
                                     out_channels=out_channels,
                                     conv_width=conv_width,
                                     hidden_size=hidden_size,
                                     cuda_device=cuda_device,
                                     h=h, dropout=dropout)

    # train
    train_config = config_dict['Train']
    num_epoch = train_config['epoch']
    weight_decay = train_config['weight_decay']
    lr = train_config['lr']  # unused below: the Noam schedule sets the rate
    early_stopping = train_config['early_stopping']
    factor = train_config['factor']
    warmup = train_config['warmup']
    input_names = ['q_words', 'a_words']
    # optimizer = optim.Adam(clf.parameters(), lr=lr,
    #                        weight_decay=weight_decay, eps=1e-5)
    optimizer = NoamOpt(
        clf.len_embed, factor, warmup,
        optim.Adam(clf.parameters(), lr=0, weight_decay=weight_decay,
                   eps=1e-5))
    if cuda_device is not None:
        clf.cuda(device=cuda_device)

    def callback(verbose=True):
        # Evaluate on the training set.
        train_labels, train_scores = get_label_score(clf, train_iterator,
                                                     cuda_device, 'label',
                                                     input_names=input_names)
        train_predicts = train_scores.argmax(axis=-1)
        train_scores = train_scores[:, 1]
        if verbose:
            print('train_acc: %.2f' %
                  sklearn.metrics.accuracy_score(train_labels, train_predicts))
            print('train_precision: %.2f' %
                  sklearn.metrics.precision_score(train_labels, train_predicts))
            print('train_average_precision: %.2f' %
                  sklearn.metrics.average_precision_score(train_labels,
                                                          train_scores))

        # Evaluate on the dev set.
        dev_labels, dev_scores = get_label_score(clf, dev_iterator,
                                                 cuda_device, 'label',
                                                 input_names=input_names)
        dev_predicts = dev_scores.argmax(axis=-1)
        dev_scores = dev_scores[:, 1]
        if verbose:
            print('dev_acc: %.2f' %
                  sklearn.metrics.accuracy_score(dev_labels, dev_predicts))
            print('dev_precision: %.2f' %
                  sklearn.metrics.precision_score(dev_labels, dev_predicts))
            print('dev_average_precision: %.2f' %
                  sklearn.metrics.average_precision_score(dev_labels,
                                                          dev_scores))

        # Per-query ranking metrics on the dev set.
        index = 0
        dev_aps = []  # for mean average precision score
        rrs = []  # for mean reciprocal rank score
        for query_labels in filtered_ref_generator(dev_ref):
            query_scores = dev_scores[index:index + len(query_labels)]
            index += len(query_labels)
            dev_aps.append(
                sklearn.metrics.average_precision_score(query_labels,
                                                        query_scores))
            query_rel_best = np.argmin(-query_scores * query_labels)
            rrs.append(
                1 / (np.argsort(np.argsort(-query_scores))[query_rel_best] + 1))
        if verbose:
            print('dev_MAP: %.2f' % np.mean(dev_aps))
            print('dev_MRR: %.2f' % np.mean(rrs))

        # Test-set metrics are also reported each callback; the callback
        # still returns the dev MAP below.
        test_labels, test_scores = get_label_score(clf, test_iterator,
                                                   cuda_device, 'label',
                                                   input_names=input_names)
        test_predicts = test_scores.argmax(axis=-1)
        test_scores = test_scores[:, 1]
        if verbose:
            print('test_acc: %.2f' %
                  sklearn.metrics.accuracy_score(test_labels, test_predicts))
            print('test_precision: %.2f' %
                  sklearn.metrics.precision_score(test_labels, test_predicts))
            print('test_average_precision: %.2f' %
                  sklearn.metrics.average_precision_score(test_labels,
                                                          test_scores))

        # Per-query ranking metrics on the test set.
        index = 0
        test_aps = []  # for mean average precision score
        rrs = []  # for mean reciprocal rank score
        for query_labels in filtered_ref_generator(test_ref):
            query_scores = test_scores[index:index + len(query_labels)]
            index += len(query_labels)
            test_aps.append(
                sklearn.metrics.average_precision_score(query_labels,
                                                        query_scores))
            query_rel_best = np.argmin(-query_scores * query_labels)
            rrs.append(
                1 / (np.argsort(np.argsort(-query_scores))[query_rel_best] + 1))
        if verbose:
            print('test_MAP: %.2f' % np.mean(test_aps))
            print('test_MRR: %.2f' % np.mean(rrs))

        return np.mean(dev_aps)

    print('Training...')
    best_state_dict = train_model(clf, optimizer, train_iterator,
                                  label_name='label', num_epoch=num_epoch,
                                  cuda_device=cuda_device,
                                  early_stopping=early_stopping,
                                  input_names=input_names, callback=callback)
    print()
    if best_state_dict is not None:
        clf.load_state_dict(best_state_dict)
    torch.save(clf.state_dict(), os.path.join(model_dir, 'net.pt'))

    print('Testing...')
    test_labels, test_scores = get_label_score(clf, test_iterator, cuda_device,
                                               'label', input_names=input_names)
    test_predicts = test_scores.argmax(axis=-1)
    test_scores = test_scores[:, 1]
    print('test_acc: %.2f' %
          sklearn.metrics.accuracy_score(test_labels, test_predicts))
    print('test_precision: %.2f' %
          sklearn.metrics.precision_score(test_labels, test_predicts))
    print('test_average_precision: %.2f' %
          sklearn.metrics.average_precision_score(test_labels, test_scores))

    # Final per-query ranking metrics on the test set.
    index = 0
    aps = []  # for mean average precision score
    rrs = []  # for mean reciprocal rank score
    for query_labels in filtered_ref_generator(test_ref):
        query_scores = test_scores[index:index + len(query_labels)]
        index += len(query_labels)
        aps.append(
            sklearn.metrics.average_precision_score(query_labels, query_scores))
        query_rel_best = np.argmin(-query_scores * query_labels)
        rrs.append(
            1 / (np.argsort(np.argsort(-query_scores))[query_rel_best] + 1))
    print('test_MAP: %.4f' % np.mean(aps))
    print('test_MRR: %.4f' % np.mean(rrs))
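# A small worked example of the reciprocal-rank computation used in the
# callbacks above (toy numbers, not repository data), assuming labels are 0/1
# and scores are the positive-class scores:
#
#     scores = np.array([0.2, 0.9, 0.4])
#     labels = np.array([0, 0, 1])
#     best_rel = np.argmin(-scores * labels)   # -> 2, highest-scoring relevant answer
#     ranks = np.argsort(np.argsort(-scores))  # -> [2, 0, 1], 0-based rank of each answer
#     rr = 1 / (ranks[best_rel] + 1)           # -> 1 / 2 = 0.5
#
# np.argsort(-scores) orders the answers by descending score; applying argsort
# a second time converts that ordering into each answer's 0-based rank, so the
# reciprocal rank of the best relevant answer is 1 / (rank + 1).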