Exemple #1
0
def run_pipeline(skip_corenlp=True, corenlp_dir=None, overwrite=False, extension='.xml', nice=False):
    """Optionally run CoreNLP over the raw sentence files, then parse its XML output.

    The function works inside three directories obtained through ``fh.makedirs``:
    the Stanford output directory, the raw-sentences directory (input ``*.txt``
    files) and an ``xml`` sub-directory of the output directory.

    Args:
        skip_corenlp: If true (the default), do not call CoreNLP and only parse
            whatever XML files are already present in the xml directory.
        corenlp_dir: Passed through unchanged to ``call_corenlp``.
        overwrite: If true, every ``*.txt`` input file is sent to CoreNLP; otherwise
            only those for which ``<xml_dir>/<basename><extension>`` does not exist.
        extension: Suffix that CoreNLP appends to an input file name (the input
            name already ends in ``.txt``, so outputs look like ``foo.txt.xml``).
        nice: Passed through unchanged to ``call_corenlp``.

    Returns:
        None. Results are written by ``parse_summary_to_files``.
    """
    output_dir = fh.makedirs(dirs.data_stanford_dir)
    temp_dir = fh.makedirs(dirs.data_raw_sentences_dir)
    xml_dir = fh.makedirs(output_dir, 'xml')

    # part 1 (splitting the input into separate files) is now done by the preprocessing tools

    # part 2: run CoreNLP on the files that still need processing
    if not skip_corenlp:
        filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
        text_files = sorted(glob.glob(os.path.join(temp_dir, '*.txt')))
        if overwrite:
            # single-argument print(...) behaves the same on Python 2 and Python 3
            print("Reprocessing all files")
            files = text_files
        else:
            # keep only the inputs that do not have an XML output yet
            files = [f for f in text_files
                     if not os.path.exists(os.path.join(xml_dir, os.path.basename(f) + extension))]
        print("%d files to process" % len(files))

        # the file list is (re)written even when it is empty
        with open(filelist_filename, 'w') as output_file:
            for f in files:
                output_file.write(f + '\n')

        if files:
            # the properties file is resolved relative to the current working directory
            properties_file = os.path.join(os.getcwd(), 'core', 'external', 'CoreNLP.properties')
            print("Calling corenlp")
            call_corenlp(filelist_filename, xml_dir, corenlp_dir, properties_file, nice)

    # part 3: parse every XML file found in the xml directory
    print("Parsing xml")
    xml_filelist_filename = fh.make_filename(output_dir, 'xml_filelist', 'txt')

    xml_files = glob.glob(os.path.join(xml_dir, '*.txt' + extension))
    with open(xml_filelist_filename, 'w') as output_file:
        for f in xml_files:
            output_file.write(f + '\n')
    summary, dependencies = parse_xml_files(xml_filelist_filename, output_dir)

    # part 4: write out the parsed results
    print("Writing summary")
    parse_summary_to_files(summary, dependencies, output_dir)
def write_sentences(f):
    """Write the sentences of every response in a JSON file to per-key text files.

    The JSON file is read with ``fh.read_json`` and is used as a mapping from a
    key to a text. For each key (in sorted order) a UTF-8 file
    ``<key>.txt`` is written to the ``temp`` directory under
    ``dirs.data_semafor_dir``, containing one stripped, non-empty sentence per line.

    Args:
        f: Path of the JSON file to read.

    Returns:
        None.
    """
    output_dir = fh.makedirs(dirs.data_semafor_dir, 'temp')
    responses = fh.read_json(f)

    # sorted() accepts both the key list of Python 2 and the key view of Python 3,
    # whereas calling .sort() on the result of keys() fails on Python 3
    for k in sorted(responses.keys()):
        sentence_filename = os.path.join(output_dir, k + '.txt')
        with codecs.open(sentence_filename, 'w', encoding='utf-8') as output_file:
            # paragraphs are separated by blank lines; empty paragraphs are skipped
            paragraphs = [p for p in responses[k].split('\n\n') if p != '']
            for p in paragraphs:
                for sent in tokenizer.split_sentences(p):
                    sent = sent.strip()
                    if sent:
                        output_file.write(sent + '\n')
Exemple #3
0
def test_over_time(project_dir, subset, config_file, model_type, field, train_start, train_end, test_start, test_end,
                   n_train=None, n_calib=0, penalty='l2', suffix='', loss='log', objective='f1', do_ensemble=True,
                   dh=300, label='label', intercept=True, n_dev_folds=5, average='micro', seed=None, alpha_min=0.01,
                   alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False, annotated_subset=None,
                   nonlinearity='tanh', init_lr=1e-2, min_epochs=2, max_epochs=50, patience=5, tol=1e-4, list_size=1,
                   repeats=1, oracle=False, lower=None, interactive=False, stoplist_file=None, cshift=False,
                   n_cshift=None, do_cfm=True, do_platt=True, dropout=0.0, min_test=None, test_prop=None,
                   verbose=False):
    """Train on one range of a metadata field, test on another, and save proportion estimates.

    Items are split on ``metadata[field]``: test items have a value in
    ``[test_start, test_end]``; train items have a value ``<= train_end`` (or
    ``< test_start`` when ``train_end`` is None) and, if given, ``>= train_start``.
    For each of ``repeats`` repetitions a model is trained with
    ``train.train_model_with_labels`` and two CSV files (``accuracy.csv`` and
    ``results.csv``) are written into the model's directory under
    ``dirs.dir_models(project_dir)``.

    Args:
        project_dir: Project directory handed to the ``dirs``/``train``/``predict`` helpers.
        subset: Name of the data subset (used to locate metadata and labels).
        config_file: JSON file with a ``feature_defs`` list of feature strings.
        model_type: Model type passed to ``train.train_model_with_labels``.
        field: Metadata column used for the train/test split.
        train_start, train_end: Optional bounds of the training range (converted with ``int``).
        test_start, test_end: Bounds of the test range (converted with ``int``).
        n_train: If not None and enough labeled train items exist, sample this many.
        n_calib: Number of labeled test items to sample as a calibration set (0 for none).
        penalty, alpha_min, alpha_max, n_alphas, intercept, n_dev_folds, dh, do_ensemble,
        objective, group_identical, nonlinearity, init_lr, min_epochs, max_epochs, patience,
        lower, dropout, verbose: Passed through to ``train.train_model_with_labels``.
        loss: Used for the covariate-shift model only; the main model is trained with ``'log'``.
        suffix: Recorded in the log that is given to ``make_model_basename``.
        label: Base name of the label CSV file and column name of the predictions.
        average: Passed to ``evaluate_predictions.evaluate_predictions``.
        seed: Optional random seed; incremented by one at the start of every repetition.
        sample_labels: If true, sample a single label per item in proportion to its annotations.
        repeats: Number of repetitions.
        stoplist_file: Optional file read with ``fh.read_text`` to build a stoplist.
        cshift: If true, learn item weights with a train-vs-test classifier.
        n_cshift: Optional number of items to sample for the covariate-shift classifier.
        do_cfm, do_platt: Passed to training and to ``model.predict_proportions``.
        min_test: If not None, return early when fewer test items than this are available.
        test_prop: If not None, sub-select test items to move the label proportion towards it.
        annotated_subset, tol, list_size, interactive: Only recorded in the log file.
        oracle: Not used by this function.

    Returns:
        None.
    """
    # Just run a regular model, one per year, training on the past, and save the results

    if seed is not None:
        seed = int(seed)
        np.random.seed(seed)

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'model_type': model_type,
        'field': field,
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
        'n_train': n_train,
        'n_calib': n_calib,
        'penalty': penalty,
        'cshift': cshift,
        'n_cshift': n_cshift,
        'suffix': suffix,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'interactive': interactive,
        'stoplist_file': stoplist_file,
        'list_size': list_size
    }

    model_basename = make_model_basename(log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))

    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = [features.parse_feature_string(f) for f in config['feature_defs']]

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = sorted(set(metadata[field].values))
    # the field name must be %-formatted into the message (not passed as a second print argument)
    print("Splitting data according to %s" % field)
    print("Values:", field_vals)

    print("\nTesting on %s to %s" % (test_start, test_end))

    # first, split into training and non-train data based on the field of interest
    test_selector_all = (metadata[field] >= int(test_start)) & (metadata[field] <= int(test_end))
    test_subset_all = metadata[test_selector_all]
    test_items_all = test_subset_all.index.tolist()
    n_test_all = len(test_items_all)

    if min_test is not None:
        if n_test_all < min_test:
            print("Not enough test samples; exiting")
            return

    # train_start is converted with int() like the other three bounds
    if train_end is None:
        if train_start is None:
            train_selector_all = metadata[field] < int(test_start)
        else:
            train_selector_all = (metadata[field] < int(test_start)) & (metadata[field] >= int(train_start))
    else:
        if train_start is None:
            train_selector_all = metadata[field] <= int(train_end)
        else:
            train_selector_all = (metadata[field] <= int(train_end)) & (metadata[field] >= int(train_start))

    train_subset_all = metadata[train_selector_all]
    train_items_all = list(train_subset_all.index)
    n_train_all = len(train_items_all)
    # only keep the items in the train and test sets (train items first, then test items)
    all_items = train_items_all + test_items_all

    print("Train: %d, Test: %d (labeled and unlabeled)" % (n_train_all, n_test_all))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    labels_df = labels_df.loc[all_items]

    # if desired, attempt to learn weights for the training data using techniques for covariate shift
    if cshift:
        print("Training a classifier for covariate shift")
        # start by learning to discriminate train from non-train data
        # Label items based on whether they come from train or test
        train_test_labels = np.zeros((len(all_items), 2), dtype=int)
        train_test_labels[:n_train_all, 0] = 1
        train_test_labels[n_train_all:, 1] = 1
        # use the less frequent of the two classes as the positive label
        if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
            cshift_pos_label = 0
        else:
            cshift_pos_label = 1
        train_test_labels_df = pd.DataFrame(train_test_labels, index=all_items, columns=[0, 1])

        if n_cshift is not None and len(all_items) >= n_cshift:
            print("Taking a random sample of %d items for reweighting" % n_cshift)
            cshift_items = np.random.choice(all_items, size=n_cshift, replace=False)
        else:
            print("Using all train items")
            cshift_items = all_items

        print(train_test_labels_df.loc[cshift_items].mean(axis=0))

        # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + 'cshift'
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
            project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs,
            items_to_use=cshift_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
            n_alphas=n_alphas, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True,
            do_ensemble=False, dh=dh, seed=seed, pos_label=cshift_pos_label, verbose=False)
        print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

        X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        cshift_pred_probs = model.predict_probs(X_cshift)
        # the loaded feature rows must be in the same order as all_items, because the
        # weights below are sliced positionally with [:n_train_all]
        f_items = features_concat.get_items()
        assert len(f_items) == len(all_items)
        for i in range(len(all_items)):
            assert all_items[i] == f_items[i]
        cshift_pred_probs_df = pd.DataFrame(cshift_pred_probs, index=features_concat.get_items(), columns=range(2))

        # display the min and max probs
        print("Min: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].min())
        print("Mean: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].mean())
        print("Max: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].max())
        # HACK: need to prevent 0s in prob(y=0|x)
        p_train_values = cshift_pred_probs_df[0].values
        threshold = 0.01
        p_train_values[p_train_values < threshold] = threshold
        print("After thresholding")
        print("Min: %0.6f" % p_train_values[:n_train_all].min())
        print("Mean: %0.6f" % p_train_values[:n_train_all].mean())
        print("Max: %0.6f" % p_train_values[:n_train_all].max())

        # use the estimated probability of each item being a training item to compute item weights
        weights = n_train_all / float(n_test_all) * (1.0/p_train_values - 1)
        weights_df_all = pd.DataFrame(weights, index=all_items)
        # print a summary of the weights from just the training items
        print("Min weight: %0.4f" % weights[:n_train_all].min())
        print("Ave weight: %0.4f" % weights[:n_train_all].mean())
        print("Max weight: %0.4f" % weights[:n_train_all].max())
    else:
        weights_df_all = None

    # find the labeled items (those with at least one annotation)
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items" % n_labeled_items)
    labeled_items = set(labels_df.index)

    train_items_labeled = [i for i in train_items_all if i in labeled_items]

    test_items = [i for i in test_items_all if i in labeled_items]
    n_test = len(test_items)

    for r in range(repeats):

        # set seed very explicitly here to make sure experiments are comparable
        if seed is not None:
            seed += 1
            np.random.seed(seed)

        print("* Starting repetition %d *" % r)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + '_' + str(r)
        if n_train is not None and len(train_items_labeled) >= n_train:
            np.random.shuffle(train_items_labeled)
            train_items = np.random.choice(train_items_labeled, size=n_train, replace=False)
        else:
            print("Using all train items")
            train_items = train_items_labeled
        n_train_r = len(train_items)

        # now, choose a calibration set
        if n_calib > 0 and n_test >= n_calib:
            np.random.shuffle(test_items)
            calib_items = np.random.choice(test_items, size=n_calib, replace=False)
        elif n_test < n_calib:
            print("Error: Only %d labeled test instances available" % n_test)
            calib_items = test_items
        else:
            calib_items = []

        if weights_df_all is not None:
            weights_df = weights_df_all[labeled_item_selector]
        else:
            weights_df = None

        print("Labeled train: %d, test: %d" % (n_train_r, n_test))

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE', '95lcl', '95ucl', 'contains_test'])

        test_labels_df = labels_df.loc[test_items]
        # do a fake adjustment of the test label proportions
        # NOTE(review): test_items is replaced here, so later repetitions start from the
        # reduced list, while n_test keeps its original value - confirm this is intended
        if test_prop is not None:
            test_prop = float(test_prop)
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            order = list(np.argsort(test_label_props))
            n_order = len(order)

            true_prop = np.mean(test_label_props)
            if test_prop < true_prop:
                # add items from the lowest proportion upwards until the running mean exceeds test_prop
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                # the i < n_order guard stops the loop from indexing past the last item
                while i < n_order and (running / i) <= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]
            else:
                # add items from the highest proportion downwards until the running mean drops below test_prop
                order.reverse()
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                # the i < n_order guard stops the loop from indexing past the last item
                # (e.g. when test_prop equals the true proportion)
                while i < n_order and (running / i) >= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]

            test_labels_df = labels_df.loc[test_items]
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            print("New props = %0.3f" % np.mean(test_label_props))

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_labeled_items, 1)), dtype=float)
            samples = np.zeros([n_labeled_items, n_classes], dtype=int)
            for i in range(n_labeled_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()
        if n_calib > 0:
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()
        else:
            calib_labels_df = None

        # get the true proportion of labels in the test OR non-training data (calibration and test combined)
        target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
        output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
        print("Train props:", train_props, train_estimate)
        train_rmse = np.abs(train_estimate - target_estimate)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train_r, 'train', 'test', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # get the same estimate from calibration data
        if n_calib > 0:
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            calib_rmse = np.abs(calib_estimate - target_estimate)
            calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]
        else:
            calib_estimate = 0.0
            calib_std = 1.0
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', np.nan, np.nan, np.nan, np.nan, np.nan]

        # treat the label that is in the minority in the training data as the positive one
        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

        # Now train a model on the training data, saving the calibration data for calibration

        if stoplist_file is not None:
            stoplist = fh.read_text(stoplist_file)
            stoplist = {s.strip() for s in stoplist}
            print(stoplist)
        else:
            stoplist = None

        print("Training a LR model")
        # NOTE: the loss is fixed to 'log' here; the `loss` argument only affects the cshift model
        model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
            project_dir, model_type, 'log', model_name, subset, sampled_labels_df, feature_defs,
            weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min,
            alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective,
            n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
            vocab=None, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr,
            min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, do_cfm=do_cfm,
            do_platt=do_platt, lower=lower, stoplist=stoplist, dropout=dropout, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

        X_test, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=test_items)
        test_predictions = model.predict(X_test)
        test_predictions_df = pd.DataFrame(test_predictions, index=features_concat.get_items(), columns=[label])
        test_pred_probs = model.predict_probs(X_test)
        _, n_labels = test_pred_probs.shape
        test_pred_probs_df = pd.DataFrame(test_pred_probs, index=features_concat.get_items(), columns=range(n_labels))

        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        # .values is used instead of DataFrame.as_matrix(), which was removed in pandas 1.0
        true_test_vector = np.argmax(test_labels_df.values, axis=1)
        test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.values, min_bins=1, max_bins=1)
        test_cc_estimate, test_pcc_estimate = model.predict_proportions(X_test)

        test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
        test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

        results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

        output_df.loc['CC'] = [n_train_r, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
        output_df.loc['PCC'] = [n_train_r, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

        test_acc_estimate_internal, test_acc_ms_estimate_internal = model.predict_proportions(X_test, do_cfm=do_cfm)

        test_acc_rmse_internal = np.abs(test_acc_estimate_internal[1] - target_estimate)
        test_acc_ms_rmse_internal = np.abs(test_acc_ms_estimate_internal[1] - target_estimate)

        output_df.loc['ACC_internal'] = [n_train_r, 'train', 'test', 'n/a', test_acc_estimate_internal[1], test_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['MS_internal'] = [n_train_r, 'train', 'nontrain', 'predicted', test_acc_ms_estimate_internal[1], test_acc_ms_rmse_internal, np.nan, np.nan, np.nan]

        test_platt1_estimate, test_platt2_estimate = model.predict_proportions(X_test, do_platt=do_platt)

        test_platt1_rmse = np.abs(test_platt1_estimate[1] - target_estimate)
        test_platt2_rmse = np.abs(test_platt2_estimate[1] - target_estimate)

        output_df.loc['PCC_platt1'] = [n_train_r, 'train', 'test', 'n/a', test_platt1_estimate[1], test_platt1_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_platt2'] = [n_train_r, 'train', 'nontrain', 'predicted', test_platt2_estimate[1], test_platt2_rmse, np.nan, np.nan, np.nan]

        if n_calib > 0:
            # average the model-based estimates with the estimate from the calibration sample;
            # only the PCC variant is written to the output
            cc_plus_cal_estimate = (test_cc_estimate[1] + calib_estimate) / 2.0
            pcc_plus_cal_estimate = (test_pcc_estimate[1] + calib_estimate) / 2.0
            cc_plus_cal_mae = np.mean(np.abs(cc_plus_cal_estimate - target_estimate))
            pcc_plus_cal_mae = np.mean(np.abs(pcc_plus_cal_estimate - target_estimate))

            output_df.loc['PCC_plus_cal'] = [n_train_r, 'train', 'test', 'n/a', pcc_plus_cal_estimate, pcc_plus_cal_mae, np.nan, np.nan, np.nan]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
        """
Exemple #4
0
def cross_train_and_eval(project_dir,
                         reference_model_dir,
                         subset,
                         field_name,
                         config_file,
                         n_train=100,
                         field_val=None,
                         vocab_file=None,
                         group_identical=False,
                         suffix='',
                         model_type='MLP',
                         loss='log',
                         do_ensemble=True,
                         dh=100,
                         label='label',
                         n_dev_folds=5,
                         repeats=1,
                         verbose=False,
                         average='micro',
                         objective='calibration',
                         seed=None,
                         init_lr=1e-4,
                         min_epochs=2,
                         max_epochs=50,
                         early_stopping=False,
                         tol=1e-4,
                         patience=8):
    n_calib = 0
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + str(n_calib) + '_' + objective
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'n_calib': n_calib,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'do_ensemble': do_ensemble,
        'label': label,
        'field_val': field_val,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        'objective': objective,
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field_name].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)
    print(field_vals)

    if field_val is not None:
        field_vals = [field_val]

    # repeat the following value for each fold of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train_cshift = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train_cshift = len(non_train_items)

        print("Train: %d, non-train: %d" %
              (n_train_cshift, n_non_train_cshift))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'),
                                      index_col=0,
                                      header=0)
        n_items, n_classes = labels_df.shape

        weights_df = None

        # add in a stage to eliminate items with no labels?
        print("Subsetting items with labels")
        label_sums_df = labels_df.sum(axis=1)
        labeled_item_selector = label_sums_df > 0
        labels_df = labels_df[labeled_item_selector]
        n_items, n_classes = labels_df.shape
        labeled_items = set(labels_df.index)

        train_items = [i for i in train_items if i in labeled_items]
        non_train_items = [i for i in non_train_items if i in labeled_items]
        n_non_train = len(non_train_items)

        if weights_df is not None:
            weights_df = weights_df[labeled_item_selector]

        print("Starting repeats")
        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):
            print("* Repetition %d *" % r)
            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if n_train > 0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items,
                                                 size=n_train,
                                                 replace=False)
            else:
                train_items_r = train_items

            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'training data', 'test data',
                                         'cal', 'estimate', 'RMSE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])
            # create a unique name ofr this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            #n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            if n_calib > n_non_train:
                n_calib = int(n_non_train / 2)
                print(
                    "Warning!!: only %d non-train items; using 1/2 for calibration"
                    % n_non_train)

            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("Train: %d, calibration: %d, test: %d" %
                  (n_train_r, n_calib, n_test))
            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(
                non_train_labels_df)
            output_df.loc['target'] = [
                n_test, 'nontrain', 'nontrain', 'given', target_estimate, 0,
                target_estimate - 2 * target_std,
                target_estimate + 2 * target_std, np.nan
            ]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(
                train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [
                n_train_r, 'train', 'train', 'n/a', train_estimate, train_rmse,
                np.nan, np.nan, np.nan
            ]

            print(
                "target proportions: (%0.3f, %0.3f); train proportions: %0.3f"
                % (target_estimate - 2 * target_std,
                   target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            # repeat for labeled calibration data
            if n_calib > 0:
                calib_props, calib_estimate, calib_std = get_estimate_and_std(
                    calib_labels_df)
                calib_rmse = np.sqrt((calib_estimate - target_estimate)**2)
                # check if the test estimate is within 2 standard deviations of the estimate
                calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and calib_estimate < calib_estimate + 2 * calib_std
                output_df.loc['calibration'] = [
                    n_calib, 'calibration', 'nontrain', 'given',
                    calib_estimate, calib_rmse, calib_estimate - 2 * calib_std,
                    calib_estimate + 2 * calib_std, calib_contains_test
                ]

                # do a test using the number of annotations rather than the number of items
                calib_props2, calib_estimate2, calib_std2 = get_estimate_and_std(
                    calib_labels_df, use_n_annotations=True)
                calib_rmse2 = np.sqrt((calib_estimate2 - target_estimate)**2)
                calib_contains_test2 = target_estimate > calib_estimate2 - 2 * calib_std2 and calib_estimate < calib_estimate2 + 2 * calib_std2
                output_df.loc['calibration_n_annotations'] = [
                    n_calib, 'calibration', 'nontrain', 'given',
                    calib_estimate2, calib_rmse2,
                    calib_estimate2 - 2 * calib_std2,
                    calib_estimate2 + 2 * calib_std2, calib_contains_test2
                ]

            results_df = pd.DataFrame(
                [], columns=['f1', 'acc', 'calibration', 'calib overall'])

            # Now train a model on the training data, saving the calibration data for calibration

            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_brier_grouped(
                project_dir,
                model_name,
                subset,
                sampled_labels_df,
                feature_defs,
                weights_df=weights_df,
                vocab_file=vocab_file,
                group_identical=group_identical,
                items_to_use=train_items_r,
                intercept=True,
                n_dev_folds=n_dev_folds,
                do_ensemble=do_ensemble,
                dh=dh,
                seed=seed,
                pos_label=pos_label,
                verbose=verbose,
                init_lr=init_lr,
                min_epochs=min_epochs,
                max_epochs=max_epochs,
                early_stopping=early_stopping,
                tol=tol,
                patience=patience)
            results_df.loc['cross_val'] = [
                dev_f1, dev_acc, dev_cal, dev_cal_overall
            ]

            # predict on calibration data
            if n_calib > 0:
                calib_predictions_df, calib_pred_probs_df, calib_pred_proportions = predict.predict(
                    project_dir,
                    model,
                    model_name,
                    subset,
                    label,
                    items_to_use=calib_items,
                    verbose=verbose,
                    force_dense=True)
                calib_cc, calib_pcc, calib_acc, calib_pvc = calib_pred_proportions
                f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(
                    calib_labels_df,
                    calib_predictions_df,
                    calib_pred_probs_df,
                    pos_label=pos_label,
                    average=average,
                    verbose=False)
                true_calib_vector = np.argmax(calib_labels_df.as_matrix(),
                                              axis=1)
                calib_cal_rmse = evaluation.evaluate_calibration_rmse(
                    true_calib_vector, calib_pred_probs_df.as_matrix())
                calib_cal_rmse_overall = evaluation.evaluate_calibration_rmse(
                    true_calib_vector,
                    calib_pred_probs_df.as_matrix(),
                    min_bins=1,
                    max_bins=1)
                results_df.loc['calibration'] = [
                    f1_cal, acc_cal, calib_cal_rmse, calib_cal_rmse_overall
                ]

            # predict on test data
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose,
                force_dense=True)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels_df,
                test_predictions_df,
                test_pred_probs_df,
                pos_label=pos_label,
                average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            test_cal_rmse = evaluation.evaluate_calibration_rmse(
                true_test_vector, test_pred_probs_df.as_matrix())
            test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(
                true_test_vector,
                test_pred_probs_df.as_matrix(),
                min_bins=1,
                max_bins=1)
            results_df.loc['test'] = [
                f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall
            ]
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            # predict on calibration and test data combined
            nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=non_train_items,
                verbose=verbose,
                force_dense=True)
            nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

            if n_calib > 0:
                cc_calib_rmse = np.sqrt((calib_cc[1] - calib_estimate)**2)
                output_df.loc['CC_cal'] = [
                    n_non_train, 'train', 'calibration', 'predicted',
                    calib_cc[1], cc_calib_rmse, np.nan, np.nan, np.nan
                ]

                pcc_calib_rmse = np.sqrt((calib_pcc[1] - calib_estimate)**2)
                output_df.loc['PCC_cal'] = [
                    n_non_train, 'train', 'calibration', 'predicted',
                    calib_pcc[1], pcc_calib_rmse, np.nan, np.nan, np.nan
                ]

            cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
            pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

            output_df.loc['CC_nontrain'] = [
                n_non_train, 'train', 'nontrain', 'predicted',
                nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan
            ]
            output_df.loc['PCC_nontrain'] = [
                n_non_train, 'train', 'nontrain', 'predicted',
                nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan
            ]

            if n_calib > 0:
                averaged_cc_estimate = (
                    test_cc_estimate[1] * n_test +
                    calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pcc_estimate = (
                    test_pcc_estimate[1] * n_test +
                    calib_estimate * n_calib) / float(n_test + n_calib)

                averaged_cc_rmse = np.sqrt(
                    (averaged_cc_estimate - target_estimate)**2)
                averaged_pcc_rmse = np.sqrt(
                    (averaged_pcc_estimate - target_estimate)**2)

                output_df.loc['CC_nontrain_averaged'] = [
                    n_non_train, 'train', 'nontrain', 'given',
                    averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan,
                    np.nan
                ]
                output_df.loc['PCC_nontrain_averaged'] = [
                    n_non_train, 'train', 'nontrain', 'given',
                    averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan,
                    np.nan
                ]
            """
            nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
            nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)

            output_df.loc['ACC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)

                output_df.loc['ACC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

            # do calibration here using calibration data
            if n_calib > 0:
                # expand the data so as to only have singly-labeled, weighted items
                _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)

                #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values)
                acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                acc_corrected = calibration.apply_acc_binary(nontrain_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC'] = [n_non_train, 'train', 'nontrain', 'predicted', acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

                pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                pvc_corrected = calibration.apply_pvc(nontrain_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC'] = [n_non_train, 'train', 'nontrain', 'predicted', pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

                acc_corrected = calibration.apply_acc_binary(test_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                averaged_acc_estimate = (acc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate, averaged_acc_rmse, np.nan, np.nan, np.nan]

                pvc_corrected = calibration.apply_pvc(test_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                averaged_pvc_estimate = (pvc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate, averaged_pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn internal nontrain")
            #models = list(model._models.values())
            nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

            pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
            venn_estimate = np.mean(nontrain_preds_internal)

            venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
            venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
            output_df.loc['Venn_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            if n_calib > 0:
                print("Venn internal test")
                test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                pred_range = np.mean(test_pred_ranges_internal, axis=0)
                venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

                averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper

                output_df.loc['Venn_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                # Venn prediction using proper calibration data
                print("Venn calibration")
                calib_pred_ranges, calib_preds, calib_props_in_range, list_of_n_levels = ivap.estimate_probs_from_labels_cv(project_dir, model, model_name, sampled_labels_df, subset, calib_items=calib_items)
                print("Venn test")
                test_pred_ranges, test_preds = ivap.estimate_probs_from_labels(project_dir, model, model_name, sampled_labels_df, subset, subset, calib_items=calib_items, test_items=test_items)

                nontrain_pred_ranges = np.vstack([calib_pred_ranges, test_pred_ranges])
                nontrain_preds = np.r_[calib_preds, test_preds]

                nontrain_pred_range = np.mean(nontrain_pred_ranges, axis=0)
                nontrain_venn_estimate = np.mean(nontrain_preds)
                nontrain_venn_rmse = np.sqrt((nontrain_venn_estimate - target_estimate)**2)
                nontrain_contains_test = nontrain_pred_range[0] < target_estimate < nontrain_pred_range[1]
                output_df.loc['Venn'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_venn_estimate, nontrain_venn_rmse, nontrain_pred_range[0], nontrain_pred_range[1], nontrain_contains_test]

                test_pred_range = np.mean(test_pred_ranges, axis=0)
                averaged_venn_estimate = (np.mean(test_preds) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_venn_rmse = np.sqrt((averaged_venn_estimate - target_estimate)**2)

                averaged_lower = (test_pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (test_pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper

                output_df.loc['Venn_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_venn_estimate, averaged_venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                fh.write_list_to_text(calib_props_in_range, os.path.join(dirs.dir_models(project_dir), model_name, 'venn_calib_props_in_range.csv'))
                fh.write_list_to_text(list_of_n_levels, os.path.join(dirs.dir_models(project_dir), model_name, 'list_of_n_levels.csv'))
                results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))

            # now train a model on the training and calibration data combined
            if run_all:
                print("Training model on all labeled data")
                calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
                model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
                results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

                # get labels for test data
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
                results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, 0]
                results_df.loc['test_all'] = [f1_test, acc_test, test_cal_rmse, 0]

                nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=non_train_items, verbose=verbose)
                nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

                cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
                pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

                output_df.loc['CC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                    averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                    averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                    output_df.loc['CC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                    output_df.loc['PCC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

                nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
                nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)

                output_df.loc['ACC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                    averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)

                    output_df.loc['ACC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                    output_df.loc['PVC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

                print("Venn internal nontrain")
                nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

                pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
                venn_estimate = np.mean(nontrain_preds_internal)

                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
                output_df.loc['Venn_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

                if n_calib > 0:
                    print("Venn internal test")
                    test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                    pred_range = np.mean(test_pred_ranges_internal, axis=0)
                    venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

                    averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    venn_contains_test = averaged_lower < target_estimate < averaged_upper

                    output_df.loc['Venn_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            """
            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'accuracy.csv'))
            output_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))
Exemple #5
0
def test_over_time(project_dir,
                   subset,
                   config_file,
                   first_year,
                   stage1_logfile=None,
                   penalty='l2',
                   suffix='',
                   model_type='LR',
                   loss='log',
                   objective='f1',
                   do_ensemble=True,
                   dh=100,
                   label='label',
                   intercept=True,
                   n_dev_folds=5,
                   verbose=False,
                   average='micro',
                   seed=None,
                   alpha_min=0.01,
                   alpha_max=1000.0,
                   n_alphas=8,
                   sample_labels=False,
                   group_identical=False,
                   annotated_subset=None,
                   n_terms=0,
                   nonlinearity='tanh',
                   init_lr=1e-4,
                   min_epochs=2,
                   max_epochs=100,
                   patience=8,
                   tol=1e-4,
                   early_stopping=False,
                   DL=False):
    # Just run a regular model, one per year, training on the past, and save the results

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'first_year': first_year,
        'stage1_logfile': stage1_logfile,
        'penalty': penalty,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'n_terms': n_terms,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'early_stopping': early_stopping
    }

    model_basename = make_model_basename(log)
    stage1_model_basename = ''
    if stage1_logfile is not None:
        stage1_log = fh.read_json(stage1_logfile)
        stage1_model_basename = make_model_basename(stage1_log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))

    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata['year'].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)

    # DEBUG:
    field_vals = ['2009']

    for target_year in field_vals:
        if int(target_year) >= first_year:
            print("\nTesting on %s" % target_year)
            model_name = model_basename + '_' + str(target_year)
            stage1_model_name = stage1_model_basename + '_' + str(target_year)
            # first, split into training and non-train data based on the field of interest

            ## DEBUG!
            test_selector_all = metadata['year'] >= int(target_year)
            test_subset_all = metadata[test_selector_all]
            test_items_all = test_subset_all.index.tolist()
            n_test_all = len(test_items_all)

            train_selector_all = metadata['year'] < int(target_year)
            train_subset_all = metadata[train_selector_all]
            train_items_all = list(train_subset_all.index)
            n_train_all = len(train_items_all)

            print("Test year: %d Train: %d, Test: %d (labeled and unlabeled)" %
                  (int(target_year), n_train_all, n_test_all))

            # load all labels
            label_dir = dirs.dir_labels(project_dir, subset)
            labels_df = fh.read_csv_to_df(os.path.join(label_dir,
                                                       label + '.csv'),
                                          index_col=0,
                                          header=0)
            n_items, n_classes = labels_df.shape

            vocab = None
            if stage1_logfile is not None:

                fightin_lexicon = None
                if annotated_subset is not None:
                    print("Determining fightin' words")
                    fightin_words.find_most_annotated_features(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=train_items_all,
                        remove_stopwords=False)
                    fightin_lexicon, scores = fightin_words.load_from_config_files(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=train_items_all,
                        n=n_terms,
                        remove_stopwords=True)
                    fightin_lexicon_test, scores = fightin_words.load_from_config_files(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=test_items_all,
                        n=n_terms,
                        remove_stopwords=True)
                    print(fightin_lexicon)
                    #print(fightin_lexicon_test)
                    #vocab = list(fightin_lexicon)
                    #vocab.sort()

                print("Loading feature from stage 1")
                # load features from previous model
                top_features = get_top_features.get_top_features(
                    os.path.join(dirs.dir_models(project_dir),
                                 stage1_model_name), n_terms)
                lr_features, weights = zip(*top_features)
                vocab = list(lr_features)

                #if annotated_subset is not None:
                #    print("\nTaking intersection:")
                #    intersection = set(lr_features).intersection(set(fightin_lexicon))
                #    vocab = list(intersection)
                #    vocab.sort()
                #    for w in vocab:
                #        print(w)

                #vocab = [w for w in vocab if w not in stopwords]

                for w in vocab:
                    print(w)

                vocab.sort()

                #if annotated_subset is not None:
                #    print("Missing:")
                #    print(set(fightin_lexicon_test) - set(vocab))

            # add in a stage to eliminate items with no labels
            print("Subsetting items with labels")
            label_sums_df = labels_df.sum(axis=1)
            labeled_item_selector = label_sums_df > 0
            labels_df = labels_df[labeled_item_selector]
            n_items, n_classes = labels_df.shape
            labeled_items = set(labels_df.index)

            train_items = [i for i in train_items_all if i in labeled_items]
            test_items = [i for i in test_items_all if i in labeled_items]
            n_train = len(train_items)
            n_test = len(test_items)

            weights_df = None
            if weights_df is not None:
                weights_df = weights_df[labeled_item_selector]

            print("Labeled train: %d, test: %d" % (n_train, n_test))

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'training data', 'test data',
                                         'cal', 'estimate', 'MAE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])

            test_labels_df = labels_df.loc[test_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels
                temp = labels_df.values / np.array(
                    labels_df.values.sum(axis=1).reshape((n_items, 1)),
                    dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes),
                                             size=1,
                                             p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples,
                                                 index=labels_df.index,
                                                 columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_df = sampled_labels_df.loc[train_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(
                test_labels_df, use_n_annotations=True)

            output_df.loc['target'] = [
                n_test, 'test', 'test', 'n/a', target_estimate, 0,
                target_estimate - 2 * target_std,
                target_estimate + 2 * target_std, np.nan
            ]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(
                train_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [
                n_train, 'train', 'test', 'n/a', train_estimate, train_rmse,
                train_estimate - 2 * train_std, train_estimate + 2 * train_std,
                train_contains_test
            ]

            #print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            results_df = pd.DataFrame(
                [], columns=['f1', 'acc', 'mae', 'estimated calibration'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training a model")
            model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
                project_dir,
                model_type,
                loss,
                model_name,
                subset,
                sampled_labels_df,
                feature_defs,
                weights_df=weights_df,
                items_to_use=train_items,
                penalty='l2',
                alpha_min=alpha_min,
                alpha_max=alpha_max,
                n_alphas=n_alphas,
                intercept=intercept,
                objective=objective,
                n_dev_folds=n_dev_folds,
                do_ensemble=do_ensemble,
                dh=dh,
                seed=seed,
                pos_label=pos_label,
                vocab=vocab,
                group_identical=group_identical,
                nonlinearity=nonlinearity,
                init_lr=init_lr,
                min_epochs=min_epochs,
                max_epochs=max_epochs,
                patience=patience,
                tol=tol,
                early_stopping=early_stopping,
                verbose=verbose)
            results_df.loc['cross_val'] = [
                dev_f1, dev_acc, dev_cal_mae, dev_cal_est
            ]

            # predict on test data
            force_dense = False
            if model_type == 'MLP':
                force_dense = True
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose,
                force_dense=force_dense,
                group_identical=group_identical)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels_df,
                test_predictions_df,
                test_pred_probs_df,
                pos_label=pos_label,
                average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
            test_cal_est = evaluation.evaluate_calibration_rmse(
                true_test_vector,
                test_pred_probs_df.as_matrix(),
                min_bins=1,
                max_bins=1)
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            test_cc_mae = np.mean(np.abs(test_cc_estimate[1] -
                                         target_estimate))
            test_pcc_mae = np.mean(
                np.abs(test_pcc_estimate[1] - target_estimate))

            results_df.loc['test'] = [
                f1_test, acc_test, test_pcc_mae, test_cal_est
            ]

            output_df.loc['CC_test'] = [
                n_train, 'train', 'test', 'n/a', test_cc_estimate[1],
                test_cc_mae, np.nan, np.nan, np.nan
            ]
            output_df.loc['PCC_test'] = [
                n_train, 'train', 'test', 'n/a', test_pcc_estimate[1],
                test_pcc_mae, np.nan, np.nan, np.nan
            ]

            test_acc_rmse_internal = np.sqrt(
                (test_acc_estimate_internal[1] - target_estimate)**2)
            test_pvc_rmse_internal = np.sqrt(
                (test_pvc_estimate_internal[1] - target_estimate)**2)

            output_df.loc['ACC_internal'] = [
                n_train, 'train', 'test', 'n/a', test_acc_estimate_internal[1],
                test_acc_rmse_internal, np.nan, np.nan, np.nan
            ]
            output_df.loc['PVC_internal'] = [
                n_train, 'train', 'nontrain', 'predicted',
                test_pvc_estimate_internal[1], test_pvc_rmse_internal, np.nan,
                np.nan, np.nan
            ]
            """
            if DL:
                print("Training a model")
                model_type = 'DL'
                DL_model_name = model_name + '_DL'
                model, _, _, _, _ = train.train_model_with_labels(project_dir, model_type, loss, DL_model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol, early_stopping=early_stopping, verbose=verbose)

                # predict on test data
                force_dense = False
                if model_type == 'MLP':
                    force_dense = True
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, DL_model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=force_dense, group_identical=group_identical)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)

                #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
                test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

                test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
                test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

                output_df.loc['CC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
                output_df.loc['PCC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]
            """

            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'accuracy.csv'))
            output_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))
Exemple #6
0
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, train_prop=1.0, prefix=None, max_folds=None, min_val=None, max_val=None, model_type='LR', loss='log', do_ensemble=False, dh=0, label='label', penalty='l1', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1', seed=None, use_calib_pred=False, exclude_calib=False, alpha_min=0.01, alpha_max=1000, sample_labels=False):
    """Hold out each value of a metadata field in turn and compare label-proportion estimators.

    For every distinct value ``v`` of ``metadata[field_name]`` (optionally
    restricted by ``min_val`` / ``max_val`` and capped at ``max_folds``), the
    items with that value form the non-training data and all other items form
    the training data.  The non-training items are randomly split into a
    calibration set and a test set, models are trained, and several estimates
    of the positive-label proportion (train, calibration, CC, PCC, ACC, PVC,
    Venn) are written to a CSV file alongside the model.

    Args:
        project_dir: Project directory passed to the ``dirs`` helpers and to
            the train / predict routines.
        subset: Name of the data subset; used to locate metadata and labels
            and as part of the model name.
        field_name: Column of ``metadata.csv`` whose values define the folds.
        config_file: JSON file containing a ``'feature_defs'`` list of feature
            strings.
        calib_prop: Fraction of the non-training items used for calibration.
        train_prop: Fraction of the training items to keep; values below 1.0
            draw a random subsample on each repeat, otherwise all training
            items are used.
        prefix: Optional string prepended to the model base name.
        max_folds: Maximum number of field values to evaluate (all if None).
        min_val: If given, field values below ``float(min_val)`` are skipped.
        max_val: If given, field values above ``float(max_val)`` are skipped.
        model_type, loss, do_ensemble, dh, penalty, intercept, n_dev_folds,
        objective, seed, alpha_min, alpha_max: Forwarded unchanged to
            ``train.train_model_with_labels``.
        label: Base name of the label CSV file in the label directory; also
            forwarded to ``predict.predict``.
        cshift: If not None, a train-vs-non-train classifier is fit first and
            its predicted probabilities are turned into item weights.
        repeats: Number of random calibration / test splits per fold.
        verbose: Forwarded to the train / predict routines.
        pos_label, average: Forwarded to
            ``evaluate_predictions.evaluate_predictions``.
        use_calib_pred: If True, model predictions on the calibration items
            are combined with the test predictions; otherwise values derived
            from the calibration labels are used.
        exclude_calib: If True, the target proportion and the combined
            predictions cover the test items only, not test + calibration.
        sample_labels: If True, sample a single label per item in proportion
            to its annotation counts before training.

    Returns:
        None.  Results are written to disk: a JSON log of the parameters, a
        ``results.csv`` per model, and ``<field_name>_<v>.csv`` per model.
    """

    model_basename = subset + '_' + field_name
    if prefix is not None:
        model_basename = prefix + '_' + model_basename

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'calib_prop': calib_prop,
        'train_prop': train_prop,
        'prefix': prefix,
        'max_folds': max_folds,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'do_ensemble': do_ensemble,
        'label': label,
        'penalty': penalty,
        'cshift': cshift,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'pos_label': pos_label,
        'average': average,
        'use_calib_pred': use_calib_pred,
        'exclude_calib': exclude_calib
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = [features.parse_feature_string(f) for f in config['feature_defs']]

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = sorted(set(metadata[field_name].values))
    print(field_vals)

    # exclude certain values of the partition if desired
    if min_val is not None:
        field_vals = [v for v in field_vals if v >= float(min_val)]

    if max_val is not None:
        field_vals = [v for v in field_vals if v <= float(max_val)]

    if max_folds is None:
        max_folds = len(field_vals)

    # repeat the following for each fold of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals[:max_folds]):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train = len(non_train_items)

        print("Train: %d, non-train: %d" % (n_train, n_non_train))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
        n_items, n_classes = labels_df.shape
        train_labels = labels_df.loc[train_items]

        # if desired, attempt to learn weights for the training data using techniques for covariate shift
        if cshift is not None:
            print("Training a classifier for covariate shift")
            # start by learning to discriminate train from non-train data
            train_test_labels = np.zeros((n_items, 2), dtype=int)
            train_test_labels[train_selector, 0] = 1
            train_test_labels[non_train_selector, 1] = 1
            train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1])
            # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
            model_name = model_basename + '_' + str(v) + '_' + 'cshift'
            model, dev_f1, dev_acc, dev_cal, _, _ = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=False)
            print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

            # take predictions from model on the training data
            train_test_pred_df, train_test_probs_df = predict.predict(project_dir, model, model_name, subset, label, verbose=verbose)
            # display the min and max probs
            print("Min: %0.4f" % train_test_probs_df[1].min())
            print("Max: %0.4f" % train_test_probs_df[1].max())
            # use the estimated probability of each item being a training item to compute item weights
            weights = n_train / float(n_non_train) * (1.0/train_test_probs_df[0].values - 1)
            # print a summary of the weights from just the training items
            print("Min weight: %0.4f" % weights[train_selector].min())
            print("Ave weight: %0.4f" % weights[train_selector].mean())
            print("Max weight: %0.4f" % weights[train_selector].max())
            # print a summary of all weights
            print("Min weight: %0.4f" % weights.min())
            print("Ave weight: %0.4f" % weights.mean())
            print("Max weight: %0.4f" % weights.max())
            # create a data frame with this information
            weights_df = pd.DataFrame(weights, index=labels_df.index)
        else:
            weights_df = None

        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):

            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if train_prop < 1.0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items, size=int(n_train * train_prop), replace=False)
                n_train_r = len(train_items_r)
            else:
                # no subsampling requested: use every training item, so the
                # names used below are always defined
                train_items_r = list(train_items)
                n_train_r = n_train

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])
            # create a unique name for this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("%d %d %d" % (n_train_r, n_calib, n_test))
            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels
                temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            if exclude_calib:
                test_props, test_estimate, test_std = get_estimate_and_std(test_labels_df)
            else:
                test_props, test_estimate, test_std = get_estimate_and_std(non_train_labels_df)
            output_df.loc['test'] = [n_test, test_estimate, 0, test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - test_estimate)**2)
            train_contains_test = test_estimate > train_estimate - 2 * train_std and test_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train_r, train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

            # repeat for calibration data
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df)
            calib_rmse = np.sqrt((calib_estimate - test_estimate)**2)
            # check if the test estimate is within 2 standard deviations of the calibration estimate
            # (both bounds must be compared against test_estimate)
            calib_contains_test = test_estimate > calib_estimate - 2 * calib_std and test_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calibration'] = [n_calib, calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'cal'])

            print("Training model on all labeled data")
            # first train a model on the training and calibration data combined
            calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
            model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose)
            results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal]

            # get labels for test data
            test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            results_df.loc['test_all'] = [f1_test, acc_test, 0.0]

            # combine the predictions on the test and calibration data (unless excluding calibration data from this)
            if exclude_calib:
                test_predictions = test_predictions_df.values
                test_pred_probs = test_pred_probs_df.values
            else:
                # get labels for calibration data
                if use_calib_pred:
                    calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose)
                else:
                    calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index)
                    # normalize labels to get (questionable) estimates of probabilities
                    calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index)

                test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values]
                test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values])

            # get the basic error estimates for this model
            cc_estimate = np.mean(test_predictions)
            cc_rmse = np.sqrt((cc_estimate - test_estimate)**2)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[:, 1])
            pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2)

            output_df.loc['CC_all'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC_all'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan]

            # Now repeat for a model trained on the training data, saving the calibration data for calibration
            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,  intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal]

            # predict on calibration data
            calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose)
            f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df, calib_pred_probs_df, pos_label=pos_label, average=average, verbose=False)
            results_df.loc['calibration'] = [f1_cal, acc_cal, calib_rmse]

            # predict on test data
            test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            results_df.loc['test'] = [f1_test, acc_test, 0.0]
            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))

            # combine the predictions on the test and calibration data (unless excluding calibration data from this)
            if exclude_calib:
                test_predictions = test_predictions_df.values
                test_pred_probs = test_pred_probs_df.values
            else:
                if not use_calib_pred:
                    # NOTE(review): this replaces the model's calibration predictions with values
                    # derived from the gold labels; the PCC_cal, ACC and PVC steps below then read
                    # these label-derived values rather than model output -- confirm this is intended
                    calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index)
                    # normalize labels to get (questionable) estimates of probabilities
                    calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index)

                test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values]
                test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values])

            # now evaluate in terms of predicted proportions
            # average the predictions (assuming binary labels)
            cc_estimate = np.mean(test_predictions)
            cc_rmse = np.sqrt((cc_estimate - test_estimate)**2)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[:, 1])
            pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2)

            pcc_calib_estimate = np.mean(calib_pred_probs_df.values[:, 1])
            pcc_calib_rmse = np.sqrt((pcc_calib_estimate - calib_estimate)**2)

            output_df.loc['PCC_cal'] = [n_calib, pcc_calib_estimate, pcc_calib_rmse, np.nan, np.nan, np.nan]
            output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan]

            # expand the data so as to only have singly-labeled, weighted items
            _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)

            # do some sort of calibration here (ACC, PACC, PVC)
            print("ACC correction")
            acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
            acc_corrected = calibration.apply_acc_binary(test_predictions, acc)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2)
            output_df.loc['ACC'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

            # same correction, but using the matrix returned by training instead of the calibration set
            print("ACC internal")
            acc_corrected = calibration.apply_acc_binary(test_predictions, acc_cfm)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2)
            output_df.loc['ACC_int'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

            print("PVC correction")
            pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
            pvc_corrected = calibration.apply_pvc(test_predictions, pvc)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2)
            output_df.loc['PVC'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

            # same correction, but using the matrix returned by training instead of the calibration set
            print("PVC internal")
            pvc_corrected = calibration.apply_pvc(test_predictions, pvc_cfm)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2)
            output_df.loc['PVC_int'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn")
            test_pred_ranges, calib_pred_ranges = ivap.estimate_probs_from_labels(project_dir, model, model_name, subset, subset, sampled_labels_df, calib_items, test_items, weights_df=None)

            if not exclude_calib:
                test_pred_ranges = np.vstack([test_pred_ranges, calib_pred_ranges])

            # merge each (lower, upper) pair into a single value: upper / (1 - lower + upper)
            combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1])

            pred_range = np.mean(test_pred_ranges, axis=0)
            venn_estimate = np.mean(combo)

            venn_rmse = np.sqrt((venn_estimate - test_estimate)**2)
            venn_contains_test = pred_range[0] < test_estimate < pred_range[1]
            output_df.loc['Venn'] = [n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv')
            output_df.to_csv(output_filename)
Exemple #7
0
def cross_train_and_eval(project_dir, subset, config_file, n_train=500, suffix='', model_type='LR', loss='log', do_ensemble=True, dh=100, label='label', penalty='l1', intercept=True, n_dev_folds=5, repeats=1, verbose=False, average='micro', objective='f1', seed=None, alpha_min=0.01, alpha_max=1000.0, sample_labels=False, run_all=False):

    field_name = 'nosplit'
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type + '_' + penalty
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + objective
    if sample_labels:
        model_basename += '_sampled'
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': 'nosplit',
        'config_file': config_file,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'do_ensemble': do_ensemble,
        'label': label,
        'penalty': penalty,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        #'use_calib_pred': use_calib_pred,
        #'exclude_calib': exclude_calib,
        'sample_labels': sample_labels
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    n_items, n_classes = labels_df.shape

    weights_df = None

    # eliminate items with no labels
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_items, n_classes = labels_df.shape
    labeled_items = list(set(labels_df.index))

    print("Starting repeats")
    # repeat the following process multiple times with different random splits of train / calibration / test data
    for r in range(repeats):
        print("* Repetition %d *" % r)
        # take a random subset of the training data
        np.random.shuffle(labeled_items)
        train_items = labeled_items[:n_train]
        test_items = labeled_items[n_train:]
        n_test = len(test_items)
        n_calib = 0

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])
        # create a unique name ofr this model
        model_name = model_basename + '_' + 'nosplit' + '_' + str(r)

        print("Train: %d, calibration: %d, test: %d" % (n_train, n_calib, n_test))
        test_labels_df = labels_df.loc[test_items]

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
            samples = np.zeros([n_items, n_classes], dtype=int)
            for i in range(n_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()

        # get the true proportion of labels in the test OR non-training data (calibration and test combined)
        target_props, target_estimate, target_std = combo.get_estimate_and_std(labels_df)
        output_df.loc['target'] = [n_test, 'n/a', 'all', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = combo.get_estimate_and_std(train_labels_df)
        # compute the error of this estimate
        train_rmse = np.sqrt((train_estimate - target_estimate)**2)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train, 'train', 'train', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # do a test using the number of annotations rather than the number of items
        train_props2, train_estimate2, train_std2 = combo.get_estimate_and_std(train_labels_df, use_n_annotations=True)
        # compute the error of this estimate
        train_rmse2 = np.sqrt((train_estimate2 - target_estimate)**2)
        train_contains_test2 = target_estimate > train_estimate2 - 2 * train_std2 and target_estimate < train_estimate2 + 2 * train_std2
        output_df.loc['train_n_annotations'] = [n_train, 'train', 'train', 'n/a', train_estimate2, train_rmse2, train_estimate2 - 2 * train_std2, train_estimate2 + 2 * train_std2, train_contains_test2]

        print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall'])

        # Now train a model on the training data, saving the calibration data for calibration
        print("Training model on training data only")
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,  intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

        # predict on test data
        test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
        test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
        test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
        results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall]
        test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

        # predict on calibration and test data combined
        all_predictions_df, all_pred_probs_df, all_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=labeled_items, verbose=verbose)
        all_cc_estimate, all_pcc_estimate, all_acc_estimate_internal, all_pvc_estimate_internal = all_pred_proportions

        cc_rmse = np.sqrt((all_cc_estimate[1] - target_estimate)**2)
        pcc_rmse = np.sqrt((all_pcc_estimate[1] - target_estimate)**2)

        output_df.loc['CC_all'] = [n_items, 'train', 'all', 'predicted', all_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_all'] = [n_items, 'train', 'all', 'predicted', all_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

        averaged_cc_estimate = (test_cc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train)
        averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train)

        averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
        averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

        output_df.loc['CC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

        all_acc_rmse_internal = np.sqrt((all_acc_estimate_internal[1] - target_estimate) ** 2)
        all_pvc_rmse_internal = np.sqrt((all_pvc_estimate_internal[1] - target_estimate) ** 2)

        output_df.loc['ACC_internal'] = [n_items, 'train', 'all', 'predicted', all_acc_estimate_internal[1], all_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['PVC_internal'] = [n_items, 'train', 'all', 'predicted', all_pvc_estimate_internal[1], all_pvc_rmse_internal, np.nan, np.nan, np.nan]

        print("Venn internal all")
        all_pred_ranges_internal, all_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, labeled_items, plot=False)

        pred_range = np.mean(all_pred_ranges_internal, axis=0)
        venn_estimate = np.mean(all_preds_internal)

        venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
        venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
        output_df.loc['Venn_internal'] = [n_items, 'train', 'all', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

        print("Venn internal test")
        test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

        pred_range = np.mean(test_pred_ranges_internal, axis=0)
        venn_estimate = (np.mean(test_preds_internal) * n_test + train_estimate * n_train) / float(n_test + n_train)
        venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

        averaged_lower = (pred_range[0] * n_test + (train_estimate - 2 * train_std) * n_train) / float(n_test + n_train)
        averaged_upper = (pred_range[1] * n_test + (train_estimate + 2 * train_std) * n_train) / float(n_test + n_train)
        venn_contains_test = averaged_lower < target_estimate < averaged_upper

        output_df.loc['Venn_internal_averaged'] = [n_items, 'train', 'all', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
Exemple #8
0
def cross_train_and_eval(project_dir,
                         subset,
                         field_name,
                         config_file,
                         calib_prop=0.33,
                         nontest_prop=1.0,
                         prefix=None,
                         max_folds=None,
                         model_type='LR',
                         label='label',
                         penalty='l2',
                         cshift=None,
                         intercept=True,
                         n_dev_folds=5,
                         repeats=1,
                         verbose=False,
                         pos_label=1,
                         average='micro',
                         objective='f1'):
    """Evaluate label-proportion estimators, holding out one field value at a time.

    For each distinct value of ``metadata[field_name]`` (in sorted order), the
    items with that value form the test set and all remaining items form the
    non-test set.  The non-test items are randomly split ``repeats`` times
    into calibration and training items; a model is trained on the training
    items and several estimates of the test-set label proportion are compared
    against the proportion computed from the test labels themselves.

    Files written:
      * a JSON log of the arguments in ``dirs.dir_logs(project_dir)``;
      * per (field value, repeat) model directory under
        ``dirs.dir_models(project_dir)``: ``results.csv`` (f1 / accuracy on
        the calibration and test items) and ``<field_name>_<value>.csv`` with
        one row per estimator (``test``, ``nontest``, ``CC``, ``PCC``,
        ``ACC_int``, ``PVC_int``, ``Venn``).

    :param project_dir: project directory, passed to the ``dirs`` helpers and
        to the training / prediction routines
    :param subset: name of the data subset (locates metadata and labels)
    :param field_name: metadata column whose values define the folds
    :param config_file: JSON file with a ``feature_defs`` list of feature
        strings (each parsed with ``features.parse_feature_string``)
    :param calib_prop: fraction of the non-test items used for calibration
    :param nontest_prop: if < 1.0, fraction of the non-test items to keep
        (sampled without replacement)
    :param prefix: optional string prepended to the model base name
    :param max_folds: maximum number of field values to test on (None = all)
    :param model_type: forwarded to ``train.train_model_with_labels``
    :param label: base name of the label file (``<label>.csv``); also the
        column of the prediction frame averaged for the CC estimate
    :param penalty: forwarded to ``train.train_model_with_labels``
    :param cshift: if not None, first train a test-vs-non-test classifier and
        use its predicted probabilities to weight the training instances
    :param intercept: forwarded to ``train.train_model_with_labels``
    :param n_dev_folds: forwarded to ``train.train_model_with_labels``
    :param repeats: number of random train / calibration splits per fold
    :param verbose: forwarded to the training and prediction routines
    :param pos_label: forwarded to ``evaluate_predictions.evaluate_predictions``
    :param average: forwarded to ``evaluate_predictions.evaluate_predictions``
    :param objective: forwarded to ``train.train_model_with_labels``
    :return: None
    """

    model_basename = subset + '_' + field_name
    if prefix is not None:
        model_basename = prefix + '_' + model_basename

    # record the arguments of this run before doing any work
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'calib_prop': calib_prop,
        # NOTE: the key is 'train_prop' but the value is nontest_prop; the key
        # is left as-is so existing log files keep the same schema
        'train_prop': nontest_prop,
        'prefix': prefix,
        'max_folds': max_folds,
        'model_type': model_type,
        'label': label,
        'penalty': penalty,
        'cshift': cshift,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'pos_label': pos_label,
        'average': average
    }
    fh.write_to_json(log, logfile)

    config = fh.read_json(config_file)
    feature_defs = [
        features.parse_feature_string(f) for f in config['feature_defs']
    ]

    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = sorted(set(metadata[field_name].values))
    print(field_vals)

    if max_folds is None:
        max_folds = len(field_vals)

    # load all labels once; the file does not depend on the fold, and the
    # frame is already shared across repeats, so it is treated as read-only
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'),
                                  index_col=0,
                                  header=0)
    n_items, n_classes = labels_df.shape

    for v in field_vals[:max_folds]:

        print("\nTesting on %s" % v)
        nontest_selector = metadata[field_name] != v
        nontest_subset = metadata[nontest_selector]
        nontest_items = list(nontest_subset.index)
        n_nontest = len(nontest_items)

        test_selector = metadata[field_name] == v
        test_subset = metadata[test_selector]
        test_items = test_subset.index.tolist()
        n_test = len(test_items)

        # subsample the non-test items if desired
        if nontest_prop < 1.0:
            # the shuffle is redundant with the sampling below, but it is kept
            # so that the sequence of random draws is unchanged
            np.random.shuffle(nontest_items)
            # NOTE(review): from here on nontest_items is a numpy array rather
            # than a list, and nontest_selector still covers ALL non-test
            # rows (not just the sampled ones) -- confirm this is intended
            nontest_items = np.random.choice(nontest_items,
                                             size=int(n_nontest *
                                                      nontest_prop),
                                             replace=False)
            n_nontest = len(nontest_items)

        nontest_labels = labels_df.loc[nontest_items]
        # the test items are fixed for the whole fold, so look them up once
        test_labels = labels_df.loc[test_items]

        if cshift is not None:
            print("Training a classifier for covariate shift")
            # start by learning to discriminate test from non-test data
            # NOTE(review): the metadata-derived boolean masks are applied to
            # an array with one row per label row; this assumes metadata and
            # labels contain the same items in the same order -- confirm
            train_test_labels = np.zeros((n_items, 2), dtype=int)
            train_test_labels[nontest_selector, 0] = 1
            train_test_labels[test_selector, 1] = 1
            train_test_labels_df = pd.DataFrame(train_test_labels,
                                                index=labels_df.index,
                                                columns=[0, 1])
            model_name = model_basename + '_' + str(v) + '_' + 'cshift'
            model, _, _, _, _ = train.train_model_with_labels(
                project_dir,
                model_type,
                model_name,
                subset,
                train_test_labels_df,
                feature_defs,
                penalty=penalty,
                intercept=intercept,
                n_dev_folds=n_dev_folds,
                verbose=False)

            _, train_test_probs_df = predict.predict(
                project_dir, model, model_name, subset, label, verbose=verbose)
            print("Min: %0.4f" % train_test_probs_df[1].min())
            print("Max: %0.4f" % train_test_probs_df[1].max())
            # base the weights on the probability of each item being a training item:
            # (1 / p0 - 1) == (1 - p0) / p0, scaled by the non-test / test size ratio
            weights = n_nontest / float(n_test) * (
                1.0 / train_test_probs_df[0].values - 1)
            # summary over the non-test rows only ...
            print("Min weight: %0.4f" % weights[nontest_selector].min())
            print("Ave weight: %0.4f" % weights[nontest_selector].mean())
            print("Max weight: %0.4f" % weights[nontest_selector].max())
            # ... and over all rows
            print("Min weight: %0.4f" % weights.min())
            print("Ave weight: %0.4f" % weights.mean())
            print("Max weight: %0.4f" % weights.max())
            weights_df = pd.DataFrame(weights, index=labels_df.index)
        else:
            weights_df = None

        # repeat the following process multiple times with different random splits of calibration / training data
        for r in range(repeats):
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'estimate', 'RMSE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])

            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # split the non-test items into train and calibration
            n_calib = int(n_nontest * calib_prop)
            np.random.shuffle(nontest_items)
            calib_items = nontest_items[:n_calib]
            train_items = nontest_items[n_calib:]

            calib_labels = labels_df.loc[calib_items]

            # get the label proportions from the test and non-test data
            _, test_estimate, test_std = get_estimate_and_std(test_labels)
            output_df.loc['test'] = [
                n_test, test_estimate, 0, test_estimate - 2 * test_std,
                test_estimate + 2 * test_std, 1
            ]

            _, nontest_estimate, nontest_std = get_estimate_and_std(
                nontest_labels)
            # with a single estimate, the "RMSE" is just the absolute error
            nontest_rmse = np.abs(nontest_estimate - test_estimate)
            nontest_lower = nontest_estimate - 2 * nontest_std
            nontest_upper = nontest_estimate + 2 * nontest_std
            nontest_contains_test = nontest_lower < test_estimate < nontest_upper
            output_df.loc['nontest'] = [
                n_nontest, nontest_estimate, nontest_rmse, nontest_lower,
                nontest_upper, nontest_contains_test
            ]

            # train a model
            print("Doing training")
            model, _, _, _, _ = train.train_model_with_labels(
                project_dir,
                model_type,
                model_name,
                subset,
                labels_df,
                feature_defs,
                weights_df=weights_df,
                items_to_use=train_items,
                penalty=penalty,
                intercept=intercept,
                objective=objective,
                n_dev_folds=n_dev_folds,
                verbose=verbose)

            # predict on the calibration and test sets
            print("Doing prediction on calibration items")
            calib_predictions, _ = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=calib_items,
                verbose=verbose)

            print("Doing prediction on test items")
            test_predictions, test_pred_probs = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose)

            # evaluate the model on the calibration and test data
            print("Doing evaluation")
            f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(
                calib_labels,
                calib_predictions,
                pos_label=pos_label,
                average=average)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels,
                test_predictions,
                pos_label=pos_label,
                average=average)
            results_df = pd.DataFrame([], columns=['f1', 'acc'])
            results_df.loc['calibration'] = [f1_cal, acc_cal]
            results_df.loc['test'] = [f1_test, acc_test]
            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))

            # first check results without any correction
            # average the predictions (assuming binary labels)
            cc_estimate = np.mean(test_predictions[label].values)
            cc_rmse = np.abs(cc_estimate - test_estimate)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[1].values)
            pcc_rmse = np.abs(pcc_estimate - test_estimate)

            # no interval is computed for these estimators, so the bounds are
            # recorded as the trivial [0, 1] and coverage as NaN
            output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, 0, 1, np.nan]
            output_df.loc['PCC'] = [
                n_test, pcc_estimate, pcc_rmse, 0, 1, np.nan
            ]

            # do the two basic corrections, based on the calibration data
            print("ACC internal")
            calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(
                calib_labels.values, calib_predictions.values)
            acc = calibration.compute_acc(calib_labels_expanded,
                                          calib_predictions_expanded,
                                          n_classes, calib_weights_expanded)
            acc_corrected = calibration.apply_acc_binary(
                test_predictions.values, acc)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.abs(acc_estimate - test_estimate)
            output_df.loc['ACC_int'] = [
                n_calib, acc_estimate, acc_rmse, 0, 1, np.nan
            ]

            print("PVC internal")
            pvc = calibration.compute_pvc(calib_labels_expanded,
                                          calib_predictions_expanded,
                                          n_classes,
                                          weights=calib_weights_expanded)
            pvc_corrected = calibration.apply_pvc(test_predictions.values, pvc)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.abs(pvc_estimate - test_estimate)
            output_df.loc['PVC_int'] = [
                n_calib, pvc_estimate, pvc_rmse, 0, 1, np.nan
            ]

            # do IVAP for calibration
            print("Venn")
            # NOTE(review): `subset` is passed twice; presumably the helper
            # takes separate calibration and test subsets -- confirm
            test_pred_ranges = ivap.estimate_probs_from_labels(
                project_dir,
                model,
                model_name,
                subset,
                subset,
                labels_df,
                calib_items,
                test_items,
                weights_df=weights_df)
            # collapse the two columns of each range into a single value:
            # p1 / (1 - p0 + p1)
            combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] +
                                              test_pred_ranges[:, 1])

            pred_range = np.mean(test_pred_ranges, axis=0)
            venn_estimate = np.mean(combo)
            venn_rmse = np.abs(venn_estimate - test_estimate)
            venn_contains_test = pred_range[0] < test_estimate < pred_range[1]
            output_df.loc['Venn'] = [
                n_calib, venn_estimate, venn_rmse, pred_range[0],
                pred_range[1], venn_contains_test
            ]

            output_filename = os.path.join(dirs.dir_models(project_dir),
                                           model_name,
                                           field_name + '_' + str(v) + '.csv')
            output_df.to_csv(output_filename)