def evaluate_validation_set(model, seqs, golds, lengths, sentences, criterion, labelset, compute_auc=False):
    """Evaluate `model` on a held-out set and return (mean loss, results dict).

    Args:
        model: the classifier; called as ``model(batch)``. Presumably returns
            log-probabilities (``torch.exp`` is applied below) — paired with
            NLLLoss elsewhere in this file; confirm for other criteria.
        seqs, golds, lengths, sentences: parallel per-example data fed to
            ``seqs2minibatches`` with ``batch_size=1``.
        criterion: loss function applied to (predictions, targets).
        labelset: ordered label names; class indices are 0..len(labelset)-1.
        compute_auc: when True, additionally computes per-class AUCs via
            ``get_auc`` and stores them under ``results['aucs']``.

    Returns:
        Tuple of (average loss over all sequences, dict with P/R/F metrics
        from ``p_r_f`` plus a confusion matrix under ``'cm'``).
    """
    y_true = []
    y_pred = []
    y_probs = []
    total_loss = 0
    # Fix: run evaluation under no_grad. Previously each accumulated `loss`
    # kept its autograd graph alive until the function returned, wasting
    # memory for a purely read-only pass.
    with torch.no_grad():
        # Fix: loop variable renamed so it no longer shadows the `lengths`
        # parameter (harmless here since seqs2minibatches already consumed
        # it, but confusing to read).
        for batch, targets, batch_lengths, raw_data, _ in seqs2minibatches(
                seqs, golds, lengths, sentences, batch_size=1):
            batch, targets, batch_lengths = sort_batch(batch, targets, batch_lengths)
            pred = model(batch)
            loss = criterion(pred, targets)
            pred_idx = torch.max(pred, 1)[1]
            y_probs.append(torch.exp(pred))  # log-probs -> probabilities
            y_true += list(targets.int())
            y_pred += list(pred_idx.data.int())
            total_loss += loss
    results = p_r_f(y_true, y_pred, labelset)
    # Pass explicit label indices so the matrix has a row/column for every
    # class even if some class never occurs in this split.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labelset))))
    results['cm'] = cm
    if compute_auc is True:
        y_true = [int(elm) for elm in y_true]
        aucs, precs, recs, thr = get_auc(
            y_true, torch.cat(y_probs, dim=0).detach().numpy(), labelset)
        results['aucs'] = aucs
    return total_loss.data.float() / len(seqs), results
def evaluate_validation_set(model, seqs, golds, lengths, sentences, criterion, labelset):
    """Score `model` on a validation split, one example per minibatch.

    Feeds the data through ``seqs2minibatches`` with ``batch_size=1``,
    collects gold and predicted class indices, and returns the average
    loss together with the precision/recall/F results from ``p_r_f``.
    """
    gold_indices = []
    predicted_indices = []
    cumulative_loss = 0
    minibatches = seqs2minibatches(seqs, golds, lengths, sentences, batch_size=1)
    for batch, targets, batch_lens, raw_data in minibatches:
        batch, targets, batch_lens = sort_batch(batch, targets, batch_lens)
        scores = model(batch)
        cumulative_loss += criterion(scores, targets)
        # argmax over the class dimension gives the predicted label index
        best_idx = torch.max(scores, 1)[1]
        gold_indices.extend(targets.int())
        predicted_indices.extend(best_idx.data.int())
    results = p_r_f(gold_indices, predicted_indices, labelset)
    return cumulative_loss.data.float() / len(seqs), results
# --- PMI baseline: inspect per-token PMI scores and evaluate on the dev set ---
train_labels = d['train']['label']
labelset = list(set(train_labels))
pmis, coocs = compute_pmis(train_seqs, train_labels, labelset)
# Print tokens ordered by their strongest per-label PMI, highest first.
sorted_keys = sorted(pmis, key=lambda tok: np.max(pmis[tok]), reverse=True)
for tok in sorted_keys:
    vals = pmis[tok]
    print(tok, vals, [coocs[tok][label] for label in labelset])
dev_seqs = sents2seqs(d['dev']['seq'])
dev_labels = d['dev']['label']
# predict
dev_preds, oovs, multis = predict(dev_seqs, pmis, coocs, thr=10)
print(dev_labels)
print(dev_preds)
# enumerate avoids the O(n^2) repeated labelset.index() lookups
label2idx = {label: idx for idx, label in enumerate(labelset)}
results = p_r_f([label2idx[label] for label in dev_labels],
                [label2idx[label] for label in dev_preds], labelset)
print(print_result_summary(results))
# Fix: message previously read 'OOVs"' (stray quote); now matches the
# 'Multis:' line below.
print('\nOOVs: {} {}'.format(len(oovs), 100 * len(oovs) / float(len(dev_seqs))))
for key, val in Counter(oovs).most_common():
    print(key, val)
print('\nMultis: {} {}'.format(len(multis), 100 * len(multis) / float(len(dev_seqs))))
for key, val in Counter(multis).most_common():
    print(key, val)
def main(args): # read params from csv and update the arguments if args.hyperparam_csv != '': csv_params = param_reader.read_hyperparams_from_csv( args.hyperparam_csv, args.rowid) vars(args).update(csv_params) seed = args.seed num_epochs = args.epochs batch_size = args.bs embedding_dim = args.emb_dim num_feature_maps = args.num_feature_maps kernel_size = args.ks lr = args.lr p = args.dropout use_pretrained_embeddings = args.embs datafile = args.data max_vocab = args.max_vocab additional_data_file = args.additional_data torch.manual_seed(seed) np.random.seed(seed) setup_logging(logfile='log_cnn_{}.log'.format(args.rowid)) log_params(vars(args)) config = configparser.ConfigParser( interpolation=configparser.ExtendedInterpolation()) config.read(args.config) feature_extractor = sents2seqs if use_pretrained_embeddings is True: embeddings_file = config.get('Files', 'emb_file') logging.info( 'Loading pretrained embedding from {}'.format(embeddings_file)) pretrained_embeddings, word2idx, idx2word = load_embeddings_from_file( embeddings_file, max_vocab=max_vocab) else: pretrained_embeddings, word2idx, idx2word = None, None, None d = load_json(datafile) if additional_data_file != '': additional_data_file = config.get( 'Files', 'additional_data_{}'.format(args.additional_data)) logging.info( 'Loading additional data from {}'.format(additional_data_file)) additional_data = load_json(additional_data_file) else: additional_data = {'seq': [], 'label': []} sentences = [prefix_sequence(sent, 'en') for sent in d['train']['seq']] + [ prefix_sequence(sent, 'ru') for sent in additional_data['seq'] ] labels = d['train']['label'] + additional_data['label'] if args.upsample is True: logging.info('Upsampling the train data') sentences, labels = upsample(sentences, labels) dev_sentences = [prefix_sequence(sent, 'en') for sent in d['dev']['seq']] dev_labels = d['dev']['label'] # prepare train set seqs, lengths, word2idx = feature_extractor(sentences, word2idx) embeded_seqs = np.vstack( 
[embed(seq, pretrained_embeddings) for seq in seqs]) logging.info('Vocabulary has {} entries'.format(len(word2idx))) logging.info(word2idx) golds, labelset = prepare_labels(labels, None) # prepare dev set dev_seqs, dev_lengths, _ = feature_extractor(dev_sentences, word2idx) embeded_dev_seqs = np.vstack( [embed(seq, pretrained_embeddings) for seq in dev_seqs]) dev_golds, _ = prepare_labels(dev_labels, labelset) model = SGDClassifier() logging.info('Fitting the data') model.fit(embeded_seqs, golds.numpy()) logging.info('Predicting') train_preds = model.predict(embeded_seqs) dev_preds = model.predict(embeded_dev_seqs) # predict the train data train_results = p_r_f(golds, train_preds, labelset) # predict the val data dev_results = p_r_f(dev_golds, dev_preds, labelset) logging.info('Summary train') logging.info(print_result_summary(train_results)) logging.info('Summary dev') logging.info(print_result_summary(dev_results)) dev_results['best_epoch'] = 1 dev_results['best_macro_f'] = dev_results['macro'][2] param_reader.write_results_and_hyperparams(args.result_csv, dev_results, vars(args)) test_sentences = [prefix_sequence(sent, 'en') for sent in d['test']['seq']] test_labels = d['test']['label'] # prepare test set test_seqs, test_lengths, _ = feature_extractor(test_sentences, word2idx) embeded_test_seqs = np.vstack( [embed(seq, pretrained_embeddings) for seq in test_seqs]) test_golds, _ = prepare_labels(test_labels, labelset) test_preds = model.predict(embeded_test_seqs) # predict the train data test_results = p_r_f(test_golds, test_preds, labelset) logging.info('Summary test') logging.info(print_result_summary(test_results)) """
def main(args):
    """Train a CNN text classifier and evaluate/predict on dev, test, unlabeled.

    Pipeline, in order:
      1. Optionally override hyperparameters from a csv row (``args.hyperparam_csv``).
      2. Load config, seed torch/numpy, set up logging and a uuid-based
         prediction-file name stored back into ``args.pred_file``.
      3. Load train data (English, plus optional Russian additional data),
         optionally upsample, build vocabulary and label set.
      4. Train the CNN with NLLLoss + SGD for ``args.epochs`` epochs, logging
         train/dev P/R/F each epoch and tracking the best dev macro-F epoch.
      5. Final dev evaluation with AUCs, write results csv and dev predictions.
      6. If ``args.predict_test``: evaluate and write predictions on test.
      7. If ``args.predict_all``: write predictions for the unlabeled data.

    NOTE(review): the model state used for the final evaluations is the one
    after the LAST epoch, not the best epoch — presumably intentional; confirm.
    """
    # read params from csv and update the arguments
    if args.hyperparam_csv != '':
        csv_params = param_reader.read_hyperparams_from_csv(
            args.hyperparam_csv, args.rowid)
        vars(args).update(csv_params)
    config = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation())
    config.read(args.config)
    # unpack hyperparameters into locals
    seed = args.seed
    num_epochs = args.epochs
    batch_size = args.bs
    embedding_dim = args.emb_dim
    num_feature_maps = args.num_feature_maps
    kernel_size = args.ks
    lr = args.lr
    p = args.dropout  # dropout probability
    use_pretrained_embeddings = args.embs
    # the data path is looked up in the config by split name
    datafile = config.get('Files', 'data_{}'.format(args.data_split))
    max_vocab = args.max_vocab
    additional_data_file = args.additional_data
    torch.manual_seed(seed)
    np.random.seed(seed)
    # uuid-based run name ties together the log file and prediction files
    random_name = uuid.uuid4().hex
    setup_logging(logfile='{}.log'.format(random_name))
    pred_file = os.path.join(args.pred_dir, '{}.preds'.format(random_name))
    vars(args).update({'pred_file': pred_file})
    log_params(vars(args))
    feature_extractor = sents2seqs
    if use_pretrained_embeddings is True:
        embeddings_file = config.get('Files', 'emb_file')
        logging.info(
            'Loading pretrained embedding from {}'.format(embeddings_file))
        pretrained_embeddings, word2idx, idx2word = load_embeddings_from_file(
            embeddings_file, max_vocab=max_vocab)
    else:
        pretrained_embeddings, word2idx, idx2word = None, None, None
    d = load_json(datafile)
    if additional_data_file != '':
        # args.additional_data is a config key, not a path
        additional_data_file = config.get(
            'Files', 'additional_data_{}'.format(args.additional_data))
        logging.info(
            'Loading additional data from {}'.format(additional_data_file))
        additional_data = load_json(additional_data_file)
    else:
        additional_data = {'seq': [], 'label': []}
    # train data is English; optional additional data is Russian
    sentences = [
        prefix_sequence(sent, 'en', strip_hs=args.strip)
        for sent in d['train']['seq']
    ] + [
        prefix_sequence(sent, 'ru', strip_hs=args.strip)
        for sent in additional_data['seq']
    ]
    labels = d['train']['label'] + additional_data['label']
    if args.upsample is True:
        logging.info('Upsampling the train data')
        sentences, labels = upsample(sentences, labels)
    dev_sentences = [
        prefix_sequence(sent, 'en', strip_hs=args.strip)
        for sent in d['dev']['seq']
    ]
    dev_labels = d['dev']['label']
    dev_tids = d['dev']['tid']
    dev_raw_sentences = d['dev']['seq']
    # prepare train set
    seqs, lengths, word2idx = feature_extractor(sentences, word2idx)
    logging.info('Vocabulary has {} entries'.format(len(word2idx)))
    logging.info(word2idx)
    golds, labelset = prepare_labels(labels, None)
    # prepare dev set (reuse train vocabulary and label set)
    dev_seqs, dev_lengths, _ = feature_extractor(dev_sentences, word2idx)
    dev_golds, _ = prepare_labels(dev_labels, labelset)
    model = CNN(embedding_dim=embedding_dim,
                num_feature_maps=num_feature_maps,
                pretrained_embeddings=pretrained_embeddings,
                kernel_size=kernel_size,
                vocab_size=len(word2idx),
                labelset_size=len(labelset),
                dropout=p)
    # NLLLoss implies the CNN outputs log-probabilities
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    # baseline evaluation of the untrained model (logged as epoch -1)
    dev_loss, results = evaluate_validation_set(model=model,
                                                seqs=dev_seqs,
                                                golds=dev_golds,
                                                lengths=dev_lengths,
                                                sentences=dev_sentences,
                                                criterion=loss_function,
                                                labelset=labelset)
    logging.info('Epoch {}: val f_macro {:.4f}'.format(-1,
                                                       results['macro'][2]))
    logging.info('Summary val')
    logging.info(print_result_summary(results))
    best_epoch = 0
    best_macro_f = 0
    for epoch in range(num_epochs):
        preds = []
        gold_labels = []
        total_loss = 0
        for seqs_batch, gold_batch, lengths_batch, raw_batch, _ in seqs2minibatches(
                seqs, golds, lengths, sentences, batch_size=batch_size):
            seqs_batch, gold_batch, lengths_batch = sort_batch(
                seqs_batch, gold_batch, lengths_batch)
            model.zero_grad()
            out = model(seqs_batch)
            loss = loss_function(out, gold_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss
            # collect train-set predictions while training for the epoch summary
            pred_idx = torch.max(out, 1)[1]
            gold_labels += list(gold_batch.int())
            preds += list(pred_idx.data.int())
        # predict the train data
        train_results = p_r_f(gold_labels, preds, labelset)
        # predict the val data
        dev_loss, dev_results = evaluate_validation_set(
            model=model,
            seqs=dev_seqs,
            golds=dev_golds,
            lengths=dev_lengths,
            sentences=dev_sentences,
            criterion=loss_function,
            labelset=labelset)
        # track best epoch by dev macro-F (position [2] of the 'macro' tuple)
        if dev_results['macro'][2] > best_macro_f:
            best_macro_f = dev_results['macro'][2]
            best_epoch = epoch
        logging.info(
            'Epoch {}: Train loss {:.4f}, train f_macro {:.4f}, val loss {:.4f}, val f_macro {:.4f}, best_epoch {}, best val_f_macro {:.4f}'
            .format(epoch, total_loss.data.float() / len(seqs),
                    train_results['macro'][2], dev_loss,
                    dev_results['macro'][2], best_epoch, best_macro_f))
        logging.info('Summary train')
        logging.info(print_result_summary(train_results))
        logging.info('Summary dev')
        logging.info(print_result_summary(dev_results))
        #logging.info('Summary dev_up')
        #logging.info(print_result_summary(dev_results_up))
    # final dev evaluation (with per-class AUCs) on the last-epoch model
    dev_loss, dev_results = evaluate_validation_set(model=model,
                                                    seqs=dev_seqs,
                                                    golds=dev_golds,
                                                    lengths=dev_lengths,
                                                    sentences=dev_sentences,
                                                    criterion=loss_function,
                                                    labelset=labelset,
                                                    compute_auc=True)
    logging.info(print_auc_summary(dev_results['aucs'], labelset))
    dev_results['best_epoch'] = best_epoch
    dev_results['best_macro_f'] = best_macro_f
    param_reader.write_results_and_hyperparams(args.result_csv, dev_results,
                                               vars(args), labelset)
    write_predictions(model, dev_seqs, dev_golds, dev_lengths,
                      dev_raw_sentences, dev_tids, labelset, pred_file,
                      write_probs=True)
    if args.predict_test is True:
        # Prepare test data
        test_sentences = [
            prefix_sequence(sent, 'en', strip_hs=args.strip)
            for sent in d['test']['seq']
        ]
        test_labels = d['test']['label']
        test_seqs, test_lengths, _ = feature_extractor(test_sentences,
                                                       word2idx)
        test_golds, _ = prepare_labels(test_labels, labelset)
        test_tids = d['test']['tid']
        test_raw_sentences = d['test']['seq']
        test_loss, test_results = evaluate_validation_set(
            model=model,
            seqs=test_seqs,
            golds=test_golds,
            lengths=test_lengths,
            sentences=test_sentences,
            criterion=loss_function,
            labelset=labelset,
            compute_auc=True)
        logging.info('Summary test')
        logging.info(print_result_summary(test_results))
        param_reader.write_results_and_hyperparams(args.test_result_csv,
                                                   test_results, vars(args),
                                                   labelset)
        write_predictions(model, test_seqs, test_golds, test_lengths,
                          test_raw_sentences, test_tids, labelset,
                          pred_file + '.test', write_probs=True)
    if args.predict_all is True:
        # prepare the data to be predicted
        pred_data = load_json(config.get('Files', 'unlabeled'))
        test_sentences = [
            prefix_sequence(sent, 'en', strip_hs=args.strip)
            for sent in pred_data['seq']
        ]
        test_seqs, test_lengths, _ = feature_extractor(test_sentences,
                                                       word2idx)
        test_tids = pred_data['tid']
        test_raw_sentences = pred_data['seq']
        logging.info('Predicting the unlabeled data')
        # unlabeled data has no gold labels: pass all-zero dummy golds so the
        # write_predictions interface is satisfied
        write_predictions(model, test_seqs,
                          torch.LongTensor(np.array([0 for elm in test_seqs])),
                          test_lengths, test_raw_sentences, test_tids,
                          labelset, pred_file + '.unlabeled', write_probs=True)