Example #1
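# Runs the model over a validation set one sentence at a time, accumulating the
# loss, precision/recall/F1 and (optionally) per-class AUC. The imports below
# are implied by the calls (confusion_matrix matching the sklearn.metrics
# signature); seqs2minibatches, sort_batch, p_r_f and get_auc are
# project-specific helpers assumed to be in scope.
import torch
from sklearn.metrics import confusion_matrix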
def evaluate_validation_set(model,
                            seqs,
                            golds,
                            lengths,
                            sentences,
                            criterion,
                            labelset,
                            compute_auc=False):
    y_true = list()
    y_pred = list()
    y_probs = list()
    total_loss = 0
    model.eval()  # disable dropout during evaluation
    with torch.no_grad():  # no gradients needed at evaluation time
        for batch, targets, batch_lengths, raw_data, _ in seqs2minibatches(
                seqs, golds, lengths, sentences, batch_size=1):
            batch, targets, batch_lengths = sort_batch(batch, targets,
                                                       batch_lengths)
            pred = model(batch)  # log-probabilities (the model is trained with NLLLoss)
            loss = criterion(pred, targets)
            pred_idx = torch.max(pred, 1)[1]  # index of the most probable class
            y_probs.append(torch.exp(pred))  # back to probabilities for the AUC
            y_true += list(targets.int())
            y_pred += list(pred_idx.data.int())
            total_loss += loss.item()  # plain float; keeps no graph references
    model.train()  # hand the model back in training mode
    results = p_r_f(y_true, y_pred, labelset)
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labelset))))
    results['cm'] = cm
    if compute_auc:
        y_true = [int(elm) for elm in y_true]
        aucs, precs, recs, thr = get_auc(y_true,
                                         torch.cat(y_probs, dim=0).numpy(),
                                         labelset)
        results['aucs'] = aucs
    return total_loss / len(seqs), results
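Example #5 below calls this function once per epoch to monitor validation loss and macro-F1, and once more with compute_auc=True for the final report.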
Example #2
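# A slimmer variant of the same routine: no confusion matrix or AUC, and this
# project's seqs2minibatches yields 4-tuples rather than the 5-tuples above.
import torch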
def evaluate_validation_set(model, seqs, golds, lengths, sentences, criterion,
                            labelset):
    y_true = list()
    y_pred = list()
    total_loss = 0
    model.eval()  # disable dropout during evaluation
    with torch.no_grad():  # no gradients needed at evaluation time
        for batch, targets, batch_lengths, raw_data in seqs2minibatches(
                seqs, golds, lengths, sentences, batch_size=1):
            batch, targets, batch_lengths = sort_batch(batch, targets,
                                                       batch_lengths)
            pred = model(batch)  # log-probabilities
            loss = criterion(pred, targets)
            pred_idx = torch.max(pred, 1)[1]  # index of the most probable class
            y_true += list(targets.int())
            y_pred += list(pred_idx.data.int())
            total_loss += loss.item()
    model.train()  # hand the model back in training mode

    results = p_r_f(y_true, y_pred, labelset)

    return total_loss / len(seqs), results
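Both variants depend on sort_batch, which is not shown in these examples. A minimal sketch of a compatible implementation, assuming lengths is a 1-D LongTensor and batches are ordered by descending sequence length (a common convention for padded batches):

def sort_batch(batch, targets, lengths):
    # Order the batch by descending sequence length and apply the same
    # permutation to the targets so predictions stay aligned with gold labels.
    sorted_lengths, perm_idx = lengths.sort(0, descending=True)
    return batch[perm_idx], targets[perm_idx], sorted_lengths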
Example #3
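    # Excerpt from a PMI-baseline main routine: score each training token's PMI
    # against every label, predict dev labels via thresholded PMI lookup, then
    # report P/R/F plus the out-of-vocabulary ('OOVs') and ambiguous ('Multis')
    # cases. Assumes numpy as np and collections.Counter are in scope, along
    # with the helpers compute_pmis, sents2seqs, predict, p_r_f and
    # print_result_summary.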
    train_labels = d['train']['label']

    # NB: iterating a set is not order-stable across runs; sorting gives a
    # reproducible label order.
    labelset = sorted(set(train_labels))

    pmis, coocs = compute_pmis(train_seqs, train_labels, labelset)

    sorted_keys = sorted(pmis, key=lambda x: np.max(pmis[x]), reverse=True)

    for tok in sorted_keys:
        vals = pmis[tok]
        print(tok, vals, [coocs[tok][label] for label in labelset])

    dev_seqs = sents2seqs(d['dev']['seq'])
    dev_labels = d['dev']['label']

    # predict

    dev_preds, oovs, multis = predict(dev_seqs, pmis, coocs, thr=10)
    print(dev_labels)
    print(dev_preds)
    label2idx = {label: idx for idx, label in enumerate(labelset)}
    results = p_r_f([label2idx[label] for label in dev_labels],
                    [label2idx[label] for label in dev_preds], labelset)
    print(print_result_summary(results))

    print('\nOOVs: {} {}'.format(len(oovs), 100*len(oovs)/float(len(dev_seqs))))
    for key, val in Counter(oovs).most_common():
        print(key, val)
    print('\nMultis: {} {}'.format(len(multis), 100*len(multis)/float(len(dev_seqs))))
    for key, val in Counter(multis).most_common():
        print(key, val)
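compute_pmis and predict are not shown above. A minimal sketch of what compute_pmis might look like, assuming pmis[tok] is an array of PMI scores aligned with labelset and coocs[tok][label] is a raw token-label co-occurrence count (both consistent with how they are used in the excerpt):

import numpy as np
from collections import defaultdict

def compute_pmis(seqs, labels, labelset):
    # Count how often each token type co-occurs with each label.
    coocs = defaultdict(lambda: {label: 0 for label in labelset})
    label_counts = {label: 0 for label in labelset}
    for seq, label in zip(seqs, labels):
        label_counts[label] += 1
        for tok in set(seq):
            coocs[tok][label] += 1
    n = float(len(seqs))
    pmis = {}
    for tok, counts in coocs.items():
        tok_count = sum(counts.values())
        # PMI(tok, label) = log p(tok, label) / (p(tok) * p(label))
        pmis[tok] = np.array([
            np.log(counts[label] * n / (tok_count * label_counts[label]))
            if counts[label] > 0 else -np.inf
            for label in labelset
        ])
    return pmis, dict(coocs)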
Example #4
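# End-to-end sklearn baseline: reads hyperparameters from a CSV row, loads
# optional pretrained embeddings, turns each sentence into a single feature
# vector via the project's embed() helper, fits an SGDClassifier, and logs
# train/dev/test precision/recall/F1. Likely imports below; helpers such as
# param_reader, sents2seqs, load_embeddings_from_file, load_json,
# prefix_sequence, upsample, prepare_labels, embed and p_r_f are project code.
import configparser
import logging
import numpy as np
import torch
from sklearn.linear_model import SGDClassifier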
def main(args):

    # read params from csv and update the arguments
    if args.hyperparam_csv != '':
        csv_params = param_reader.read_hyperparams_from_csv(
            args.hyperparam_csv, args.rowid)
        vars(args).update(csv_params)

    seed = args.seed
    # Most of the CNN hyperparameters below are unused by this sklearn
    # baseline; presumably they are parsed anyway so the hyperparameter
    # columns logged to the results CSV stay uniform across experiments.
    num_epochs = args.epochs
    batch_size = args.bs
    embedding_dim = args.emb_dim
    num_feature_maps = args.num_feature_maps
    kernel_size = args.ks
    lr = args.lr
    p = args.dropout
    use_pretrained_embeddings = args.embs
    datafile = args.data
    max_vocab = args.max_vocab
    additional_data_file = args.additional_data

    torch.manual_seed(seed)
    np.random.seed(seed)
    setup_logging(logfile='log_cnn_{}.log'.format(args.rowid))

    log_params(vars(args))

    config = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation())
    config.read(args.config)

    feature_extractor = sents2seqs

    if use_pretrained_embeddings is True:
        embeddings_file = config.get('Files', 'emb_file')
        logging.info(
            'Loading pretrained embedding from {}'.format(embeddings_file))
        pretrained_embeddings, word2idx, idx2word = load_embeddings_from_file(
            embeddings_file, max_vocab=max_vocab)
    else:
        pretrained_embeddings, word2idx, idx2word = None, None, None

    d = load_json(datafile)

    if additional_data_file != '':
        # args.additional_data names a key in the config, not a file path
        additional_data_file = config.get(
            'Files', 'additional_data_{}'.format(args.additional_data))
        logging.info(
            'Loading additional data from {}'.format(additional_data_file))
        additional_data = load_json(additional_data_file)
    else:
        additional_data = {'seq': [], 'label': []}
    sentences = [prefix_sequence(sent, 'en') for sent in d['train']['seq']] + [
        prefix_sequence(sent, 'ru') for sent in additional_data['seq']
    ]
    labels = d['train']['label'] + additional_data['label']

    if args.upsample is True:
        logging.info('Upsampling the train data')
        sentences, labels = upsample(sentences, labels)

    dev_sentences = [prefix_sequence(sent, 'en') for sent in d['dev']['seq']]
    dev_labels = d['dev']['label']

    # prepare train set: one fixed-length feature vector per sentence
    # (see the embed() sketch after this example)
    seqs, lengths, word2idx = feature_extractor(sentences, word2idx)
    embeded_seqs = np.vstack(
        [embed(seq, pretrained_embeddings) for seq in seqs])
    logging.info('Vocabulary has {} entries'.format(len(word2idx)))
    logging.info(word2idx)
    golds, labelset = prepare_labels(labels, None)

    # prepare dev set
    dev_seqs, dev_lengths, _ = feature_extractor(dev_sentences, word2idx)
    embeded_dev_seqs = np.vstack(
        [embed(seq, pretrained_embeddings) for seq in dev_seqs])
    dev_golds, _ = prepare_labels(dev_labels, labelset)

    model = SGDClassifier()
    logging.info('Fitting the data')
    model.fit(embeded_seqs, golds.numpy())
    logging.info('Predicting')
    train_preds = model.predict(embeded_seqs)

    dev_preds = model.predict(embeded_dev_seqs)

    # evaluate on the train data
    train_results = p_r_f(golds, train_preds, labelset)
    # evaluate on the dev data
    dev_results = p_r_f(dev_golds, dev_preds, labelset)
    logging.info('Summary train')
    logging.info(print_result_summary(train_results))
    logging.info('Summary dev')
    logging.info(print_result_summary(dev_results))

    # There are no training epochs for the sklearn baseline; store a
    # placeholder so the results CSV keeps a uniform schema.
    dev_results['best_epoch'] = 1
    dev_results['best_macro_f'] = dev_results['macro'][2]
    param_reader.write_results_and_hyperparams(args.result_csv, dev_results,
                                               vars(args))

    test_sentences = [prefix_sequence(sent, 'en') for sent in d['test']['seq']]
    test_labels = d['test']['label']
    # prepare test set
    test_seqs, test_lengths, _ = feature_extractor(test_sentences, word2idx)
    embeded_test_seqs = np.vstack(
        [embed(seq, pretrained_embeddings) for seq in test_seqs])
    test_golds, _ = prepare_labels(test_labels, labelset)
    test_preds = model.predict(embeded_test_seqs)
    # evaluate on the test data
    test_results = p_r_f(test_golds, test_preds, labelset)
    logging.info('Summary test')
    logging.info(print_result_summary(test_results))
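embed() is not shown above. For np.vstack and SGDClassifier to accept its output, it has to map one token-id sequence to one fixed-length vector; a minimal sketch, assuming a numpy embedding matrix indexed by token id and mean pooling (the pooling choice is an assumption):

import numpy as np

def embed(seq, pretrained_embeddings):
    # Average the pretrained vectors of all tokens in the sequence into a
    # single fixed-length feature vector.
    return np.mean([pretrained_embeddings[int(idx)] for idx in seq], axis=0)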
    """
Example #5
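# Full CNN training script: builds the vocabulary and label set, trains a CNN
# text classifier with NLLLoss + SGD, tracks the best dev macro-F1 per epoch,
# and optionally writes test and unlabeled-data predictions. Likely imports
# below; CNN, sents2seqs, seqs2minibatches, sort_batch, prepare_labels,
# evaluate_validation_set (Example #1), write_predictions, param_reader and
# the other helpers are project code.
import configparser
import logging
import os
import uuid
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim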
def main(args):

    # read params from csv and update the arguments
    if args.hyperparam_csv != '':
        csv_params = param_reader.read_hyperparams_from_csv(
            args.hyperparam_csv, args.rowid)
        vars(args).update(csv_params)

    config = configparser.ConfigParser(
        interpolation=configparser.ExtendedInterpolation())
    config.read(args.config)

    seed = args.seed
    num_epochs = args.epochs
    batch_size = args.bs
    embedding_dim = args.emb_dim
    num_feature_maps = args.num_feature_maps
    kernel_size = args.ks
    lr = args.lr
    p = args.dropout
    use_pretrained_embeddings = args.embs
    datafile = config.get('Files', 'data_{}'.format(args.data_split))
    max_vocab = args.max_vocab
    additional_data_file = args.additional_data

    torch.manual_seed(seed)
    np.random.seed(seed)

    random_name = uuid.uuid4().hex
    setup_logging(logfile='{}.log'.format(random_name))
    pred_file = os.path.join(args.pred_dir, '{}.preds'.format(random_name))

    vars(args).update({'pred_file': pred_file})
    log_params(vars(args))

    feature_extractor = sents2seqs

    if use_pretrained_embeddings is True:
        embeddings_file = config.get('Files', 'emb_file')
        logging.info(
            'Loading pretrained embedding from {}'.format(embeddings_file))
        pretrained_embeddings, word2idx, idx2word = load_embeddings_from_file(
            embeddings_file, max_vocab=max_vocab)
    else:
        pretrained_embeddings, word2idx, idx2word = None, None, None

    d = load_json(datafile)

    if additional_data_file != '':
        additional_data_file = config.get(
            'Files', 'additional_data_{}'.format(args.additional_data))
        logging.info(
            'Loading additional data from {}'.format(additional_data_file))
        additional_data = load_json(additional_data_file)
    else:
        additional_data = {'seq': [], 'label': []}
    sentences = [
        prefix_sequence(sent, 'en', strip_hs=args.strip)
        for sent in d['train']['seq']
    ] + [
        prefix_sequence(sent, 'ru', strip_hs=args.strip)
        for sent in additional_data['seq']
    ]
    labels = d['train']['label'] + additional_data['label']

    if args.upsample is True:
        logging.info('Upsampling the train data')
        sentences, labels = upsample(sentences, labels)

    dev_sentences = [
        prefix_sequence(sent, 'en', strip_hs=args.strip)
        for sent in d['dev']['seq']
    ]
    dev_labels = d['dev']['label']
    dev_tids = d['dev']['tid']
    dev_raw_sentences = d['dev']['seq']

    # prepare train set
    seqs, lengths, word2idx = feature_extractor(sentences, word2idx)
    logging.info('Vocabulary has {} entries'.format(len(word2idx)))
    logging.info(word2idx)
    golds, labelset = prepare_labels(labels, None)

    # prepare dev set
    dev_seqs, dev_lengths, _ = feature_extractor(dev_sentences, word2idx)
    dev_golds, _ = prepare_labels(dev_labels, labelset)

    model = CNN(embedding_dim=embedding_dim,
                num_feature_maps=num_feature_maps,
                pretrained_embeddings=pretrained_embeddings,
                kernel_size=kernel_size,
                vocab_size=len(word2idx),
                labelset_size=len(labelset),
                dropout=p)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Baseline evaluation of the untrained model, logged below as epoch -1
    dev_loss, results = evaluate_validation_set(model=model,
                                                seqs=dev_seqs,
                                                golds=dev_golds,
                                                lengths=dev_lengths,
                                                sentences=dev_sentences,
                                                criterion=loss_function,
                                                labelset=labelset)

    logging.info('Epoch {}: val f_macro {:.4f}'.format(-1,
                                                       results['macro'][2]))
    logging.info('Summary val')
    logging.info(print_result_summary(results))

    best_epoch = 0
    best_macro_f = 0
    for epoch in range(num_epochs):
        preds = []
        gold_labels = []
        total_loss = 0
        model.train()  # re-enable dropout after the evaluation pass
        for seqs_batch, gold_batch, lengths_batch, raw_batch, _ in seqs2minibatches(
                seqs, golds, lengths, sentences, batch_size=batch_size):
            seqs_batch, gold_batch, lengths_batch = sort_batch(
                seqs_batch, gold_batch, lengths_batch)
            model.zero_grad()
            out = model(seqs_batch)  # log-probabilities
            loss = loss_function(out, gold_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()  # plain float; avoids keeping graph references alive

            pred_idx = torch.max(out, 1)[1]
            gold_labels += list(gold_batch.int())
            preds += list(pred_idx.data.int())

        # compute metrics on the train predictions
        train_results = p_r_f(gold_labels, preds, labelset)
        # evaluate on the dev set

        dev_loss, dev_results = evaluate_validation_set(
            model=model,
            seqs=dev_seqs,
            golds=dev_golds,
            lengths=dev_lengths,
            sentences=dev_sentences,
            criterion=loss_function,
            labelset=labelset)
        if dev_results['macro'][2] > best_macro_f:
            best_macro_f = dev_results['macro'][2]
            best_epoch = epoch

        logging.info(
            'Epoch {}: Train loss {:.4f}, train f_macro {:.4f}, val loss {:.4f}, val f_macro {:.4f}, best_epoch {}, best val_f_macro {:.4f}'
            .format(epoch, total_loss / len(seqs),
                    train_results['macro'][2], dev_loss,
                    dev_results['macro'][2], best_epoch, best_macro_f))

        logging.info('Summary train')
        logging.info(print_result_summary(train_results))
        logging.info('Summary dev')
        logging.info(print_result_summary(dev_results))


    dev_loss, dev_results = evaluate_validation_set(model=model,
                                                    seqs=dev_seqs,
                                                    golds=dev_golds,
                                                    lengths=dev_lengths,
                                                    sentences=dev_sentences,
                                                    criterion=loss_function,
                                                    labelset=labelset,
                                                    compute_auc=True)
    logging.info(print_auc_summary(dev_results['aucs'], labelset))

    dev_results['best_epoch'] = best_epoch
    dev_results['best_macro_f'] = best_macro_f
    param_reader.write_results_and_hyperparams(args.result_csv, dev_results,
                                               vars(args), labelset)
    write_predictions(model,
                      dev_seqs,
                      dev_golds,
                      dev_lengths,
                      dev_raw_sentences,
                      dev_tids,
                      labelset,
                      pred_file,
                      write_probs=True)

    if args.predict_test is True:
        # Prepare test data
        test_sentences = [
            prefix_sequence(sent, 'en', strip_hs=args.strip)
            for sent in d['test']['seq']
        ]
        test_labels = d['test']['label']

        test_seqs, test_lengths, _ = feature_extractor(test_sentences,
                                                       word2idx)
        test_golds, _ = prepare_labels(test_labels, labelset)
        test_tids = d['test']['tid']
        test_raw_sentences = d['test']['seq']
        test_loss, test_results = evaluate_validation_set(
            model=model,
            seqs=test_seqs,
            golds=test_golds,
            lengths=test_lengths,
            sentences=test_sentences,
            criterion=loss_function,
            labelset=labelset,
            compute_auc=True)
        logging.info('Summary test')
        logging.info(print_result_summary(test_results))
        param_reader.write_results_and_hyperparams(args.test_result_csv,
                                                   test_results, vars(args),
                                                   labelset)
        write_predictions(model,
                          test_seqs,
                          test_golds,
                          test_lengths,
                          test_raw_sentences,
                          test_tids,
                          labelset,
                          pred_file + '.test',
                          write_probs=True)

    if args.predict_all is True:
        # prepare the data to be predicted
        pred_data = load_json(config.get('Files', 'unlabeled'))
        test_sentences = [
            prefix_sequence(sent, 'en', strip_hs=args.strip)
            for sent in pred_data['seq']
        ]
        test_seqs, test_lengths, _ = feature_extractor(test_sentences,
                                                       word2idx)

        test_tids = pred_data['tid']
        test_raw_sentences = pred_data['seq']
        logging.info('Predicting the unlabeled data')
        write_predictions(model,
                          test_seqs,
                          torch.zeros(len(test_seqs), dtype=torch.long),  # dummy gold labels
                          test_lengths,
                          test_raw_sentences,
                          test_tids,
                          labelset,
                          pred_file + '.unlabeled',
                          write_probs=True)
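For reference, a command-line setup consistent with the attributes this main() reads. Flag names mirror the args.* attributes used above; every default value here is an assumption:

import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser(description='CNN text classifier')
    parser.add_argument('--config', default='config.ini')
    parser.add_argument('--hyperparam_csv', default='')
    parser.add_argument('--rowid', type=int, default=0)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--bs', type=int, default=32)
    parser.add_argument('--emb_dim', type=int, default=300)
    parser.add_argument('--num_feature_maps', type=int, default=100)
    parser.add_argument('--ks', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--embs', action='store_true')
    parser.add_argument('--max_vocab', type=int, default=100000)
    parser.add_argument('--data_split', default='')
    parser.add_argument('--additional_data', default='')
    parser.add_argument('--upsample', action='store_true')
    parser.add_argument('--strip', action='store_true')
    parser.add_argument('--pred_dir', default='predictions')
    parser.add_argument('--result_csv', default='results.csv')
    parser.add_argument('--test_result_csv', default='test_results.csv')
    parser.add_argument('--predict_test', action='store_true')
    parser.add_argument('--predict_all', action='store_true')
    return parser

if __name__ == '__main__':
    main(build_arg_parser().parse_args())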