Example #1
0
def find_entities(n_files=None, use_lemmas=False):
    """Build a tree for every parsed article and find the entities in it.

    Loads the parsed-article JSON files from the ``parsed`` sub-directory of
    ``dirs.data_stanford_dir`` together with the dependency, coreference-head
    and supersense-tag JSON files, then runs ``build_tree`` followed by
    ``find_entities_in_article`` on each article.

    :param n_files: maximum number of parsed files to process (converted with
        ``int``); ``None`` processes every file found.
    :param use_lemmas: passed straight through to ``build_tree``.
    :return: ``(trees, clustered_indices)`` -- two dicts keyed by the path of
        each parsed file.
    """
    stanford_dir = dirs.data_stanford_dir
    parsed_files = glob.glob(os.path.join(stanford_dir, 'parsed', '*.json'))

    # side information, each indexed by the article's base name
    dependencies = fh.read_json(os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json'))
    coref_heads = fh.read_json(os.path.join(dirs.data_stanford_dir, 'coref_heads.json'))
    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    trees = {}
    clustered_indices = {}
    print("Building trees and finding story elements")
    limit = len(parsed_files) if n_files is None else int(n_files)

    for count, path in enumerate(parsed_files[:limit]):
        sentences = fh.read_json(path)
        doc_id = fh.get_basename_wo_ext(path)
        tree = build_tree(sentences, dependencies[doc_id], coref_heads[doc_id],
                          supersense_tags[doc_id], doc_id, use_lemmas)
        trees[path] = tree
        clustered_indices[path] = find_entities_in_article(tree)
        # progress report every 1000 files (but not for the very first one)
        if count > 0 and count % 1000 == 0:
            print(count)

    return trees, clustered_indices
def write_sentences(f):
    """Write each response in a JSON file to its own sentence-per-line file.

    The JSON file ``f`` is read as a mapping from a key to a block of text.
    For every key (in sorted order) a UTF-8 file ``<key>.txt`` is created in
    the directory returned by ``fh.makedirs(dirs.data_semafor_dir, 'temp')``.
    The text is split into paragraphs on blank lines, each paragraph is split
    into sentences with ``tokenizer.split_sentences``, and every non-empty
    sentence is written, stripped of surrounding whitespace, on its own line.

    :param f: path of the JSON file holding the responses.
    """
    output_dir = fh.makedirs(dirs.data_semafor_dir, 'temp')
    responses = fh.read_json(f)

    # sorted() works on both Python 2 lists and Python 3 dict views
    for k in sorted(responses):
        sentence_filename = os.path.join(output_dir, k + '.txt')
        with codecs.open(sentence_filename, 'w', encoding='utf-8') as output_file:
            for p in responses[k].split('\n\n'):
                if p == '':
                    continue
                for sent in tokenizer.split_sentences(p):
                    sent = sent.strip()
                    if sent:
                        output_file.write(sent + '\n')
Example #3
0
def preprocess_for_brown_clustering():
    """Tokenize every processed article and write the Brown-clustering input.

    Reads the articles from ``dirs.data_processed_text_file`` (a JSON mapping
    of key to text), tokenizes each line of each article with
    ``tokenizer.split_into_words`` and writes one line of space-separated
    tokens per article (in sorted key order) to the ``input`` text file in
    ``dirs.data_processed_brown_dir``.  The per-article token lists are also
    saved to the ``processed`` JSON file in the same directory.
    """
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    # sorted() works on both Python 2 lists and Python 3 dict views
    keys = sorted(articles)

    print(len(keys))

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'input', 'txt')

    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in keys:
            tokens = []
            for s in articles[k].split('\n'):
                # extend in place rather than re-copying the list for each sentence
                tokens.extend(tokenizer.split_into_words(s, reattach=False, split_off_quotes=False,
                                                         lemmatize=False, replace_numbers=True))
            # every key is written: the original membership test was against
            # the full key list itself and therefore always true
            output_file.write(' '.join(tokens) + '\n')
            processed_dict[k] = tokens

    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'processed', 'json')
    fh.write_to_json(processed_dict, output_filename)
Example #4
0
def split_into_files(input_filename, output_dir):
    """Split a JSON mapping of key -> text into one text file per key.

    For each key (in sorted order) the corresponding text, with trailing
    newlines removed, is written as UTF-8 to ``<key>.txt`` in ``output_dir``
    (trailing newlines are also removed from the key when forming the file
    name).  The list of files written is saved with
    ``fh.write_list_to_text`` to the ``filelist`` text file in ``output_dir``.

    :param input_filename: path of the JSON file to split.
    :param output_dir: directory that receives the text files.
    :return: the file name of the written file list, as produced by
        ``fh.make_filename``.
    """
    data = fh.read_json(input_filename)
    filelist = []

    # sorted() works on both Python 2 lists and Python 3 dict views
    for raw_key in sorted(data):
        # Look the text up with the key exactly as stored; stripping first
        # would raise KeyError for any key that ends in a newline.
        line = data[raw_key].rstrip('\n')
        key = raw_key.rstrip('\n')
        normalized_filename = os.path.join(output_dir, key + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(line)

    filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
    fh.write_list_to_text(filelist, filelist_filename)
    return filelist_filename
Example #5
0
def write_tagged_text(parsed_filename, output_filename):
    """Write the part-of-speech tag sequence of every parsed document to JSON.

    ``parsed_filename`` is read as a JSON mapping from a key to a list of
    sentences, each sentence being a list of token dicts.  Every token is
    replaced by its ``'POS'`` value (``'__MISSING__'`` when absent) followed
    by the suffix ``'_POS_'``; tokens and then sentences are joined with
    single spaces.  The result, keyed by ``fh.get_basename_wo_ext(key)``, is
    written to ``output_filename`` without sorting keys.

    :param parsed_filename: path of the JSON file with the parsed documents.
    :param output_filename: path of the JSON file to write.
    """
    data = fh.read_json(parsed_filename)

    tagged_text = {}
    for key, sentences in data.items():
        tagged_sentences = [
            ' '.join(token.get('POS', '__MISSING__') + '_POS_' for token in sentence)
            for sentence in sentences
        ]
        tagged_text[fh.get_basename_wo_ext(key)] = ' '.join(tagged_sentences)

    fh.write_to_json(tagged_text, output_filename, sort_keys=False)
def preprocess_for_easysrl():
    """Write the documents returned by ``ds.get_all_documents`` as EasySRL input.

    Reads the articles from ``dirs.data_processed_text_file`` (a JSON mapping
    of key to text) and writes the ``input`` text file in
    ``dirs.data_easysrl_dir``.  For each document key from
    ``ds.get_all_documents()`` (in sorted order) a marker line
    ``"<key> starts here"`` is written, followed by one stripped sentence per
    line; sentences come from ``tokenizer.split_sentences`` applied to each
    blank-line-separated paragraph.

    :raises KeyError: if a document key is not present in the articles file.
    """
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)

    labeled = sorted(ds.get_all_documents())

    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')

    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in labeled:
            # marker line so the output can later be split back into documents
            output_file.write(k + ' starts here\n')
            for p in articles[k].split('\n\n'):
                for s in tokenizer.split_sentences(p.strip()):
                    output_file.write(s.strip() + '\n')
Example #7
0
def test_over_time(project_dir, subset, config_file, model_type, field, train_start, train_end, test_start, test_end, n_train=None, n_calib=0, penalty='l2', suffix='', loss='log', objective='f1', do_ensemble=True, dh=300, label='label', intercept=True, n_dev_folds=5, average='micro', seed=None, alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False, annotated_subset=None, nonlinearity='tanh', init_lr=1e-2, min_epochs=2, max_epochs=50, patience=5, tol=1e-4, list_size=1, repeats=1, oracle=False, lower=None, interactive=False, stoplist_file=None, cshift=False, n_cshift=None, do_cfm=True, do_platt=True, dropout=0.0, min_test=None, test_prop=None, verbose=False):
    """Train on earlier items and estimate label proportions on a later period.

    Items are split on the metadata column ``field``: test items satisfy
    ``test_start <= field <= test_end``; training items satisfy
    ``field <= train_end`` (or ``field < test_start`` when ``train_end`` is
    None) and, when ``train_start`` is given, ``field >= train_start``.  All
    four bounds are converted with ``int``.

    The experiment parameters are saved to a JSON log file, then for each of
    ``repeats`` repetitions a model is trained with
    ``train.train_model_with_labels`` and two CSV files are written to the
    model's directory: ``accuracy.csv`` (cross-validation and test metrics)
    and ``results.csv`` (proportion estimates from the training labels, the
    calibration labels and the model's CC / PCC / ACC / Platt estimates,
    each with its absolute error against the test-set proportion).

    Selected parameters (the remainder are recorded in the log and/or passed
    through to ``train.train_model_with_labels``):

    :param field: name of the metadata column used for the temporal split.
    :param n_train: number of labeled training items to sample per
        repetition; all are used when None or when fewer are available.
    :param n_calib: number of labeled test items sampled as a calibration set.
    :param seed: base random seed; incremented by one before each repetition.
    :param sample_labels: if true, sample a single label per item in
        proportion to its annotations.
    :param cshift: if true, first train a train-vs-test classifier and use
        its predicted probabilities to weight the training items.
    :param n_cshift: number of items sampled to train the covariate-shift
        classifier (all items when None).
    :param min_test: return without training if fewer test items exist.
    :param test_prop: if given, subset the test items so that their mean
        label proportion is pushed towards this value.
    :return: None.
    """
    # Just run a regular model, one per year, training on the past, and save the results

    if seed is not None:
        seed = int(seed)
        np.random.seed(seed)

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'model_type': model_type,
        'field': field,
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
        'n_train': n_train,
        'n_calib': n_calib,
        'penalty': penalty,
        'cshift': cshift,
        'n_cshift': n_cshift,
        'suffix': suffix,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'interactive': interactive,
        'stoplist_file': stoplist_file,
        'list_size': list_size
    }

    model_basename = make_model_basename(log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))

    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field].values))
    field_vals.sort()
    # the format string must be applied with %; passing `field` as a second
    # argument would print the raw "%s" followed by the value
    print("Splitting data according to %s" % field)
    print("Values:", field_vals)

    print("\nTesting on %s to %s" % (test_start, test_end))

    # first, split into training and non-train data based on the field of interest
    test_selector_all = (metadata[field] >= int(test_start)) & (metadata[field] <= int(test_end))
    test_subset_all = metadata[test_selector_all]
    test_items_all = test_subset_all.index.tolist()
    n_test_all = len(test_items_all)

    if min_test is not None:
        if n_test_all < min_test:
            print("Not enough test samples; exiting")
            return

    # train_start is cast with int() like the other three bounds
    if train_end is None:
        if train_start is None:
            train_selector_all = metadata[field] < int(test_start)
        else:
            train_selector_all = (metadata[field] < int(test_start)) & (metadata[field] >= int(train_start))
    else:
        if train_start is None:
            train_selector_all = metadata[field] <= int(train_end)
        else:
            train_selector_all = (metadata[field] <= int(train_end)) & (metadata[field] >= int(train_start))

    train_subset_all = metadata[train_selector_all]
    train_items_all = list(train_subset_all.index)
    n_train_all = len(train_items_all)
    # only keep the items in the train and test sets (train items first)
    all_items = train_items_all + test_items_all

    print("Train: %d, Test: %d (labeled and unlabeled)" % (n_train_all, n_test_all))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    labels_df = labels_df.loc[all_items]

    # if desired, attempt to learn weights for the training data using techniques for covariate shift
    if cshift:
        print("Training a classifier for covariate shift")
        # start by learning to discriminate train from non-train data
        # Label items based on whether they come from train or test
        train_test_labels = np.zeros((len(all_items), 2), dtype=int)
        train_test_labels[:n_train_all, 0] = 1
        train_test_labels[n_train_all:, 1] = 1
        # use the less frequent class as the positive label
        if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
            cshift_pos_label = 0
        else:
            cshift_pos_label = 1
        train_test_labels_df = pd.DataFrame(train_test_labels, index=all_items, columns=[0, 1])

        if n_cshift is not None and len(all_items) >= n_cshift:
            print("Taking a random sample of %d items for reweighting" % n_cshift)
            cshift_items = np.random.choice(all_items, size=n_cshift, replace=False)
        else:
            print("Using all train items")
            cshift_items = all_items

        print(train_test_labels_df.loc[cshift_items].mean(axis=0))

        # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + 'cshift'
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
            project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs,
            items_to_use=cshift_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
            n_alphas=n_alphas, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True,
            do_ensemble=False, dh=dh, seed=seed, pos_label=cshift_pos_label, verbose=False)
        print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

        X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        cshift_pred_probs = model.predict_probs(X_cshift)
        # the slicing by n_train_all below relies on the loaded items being
        # in the same order as all_items
        f_items = features_concat.get_items()
        assert len(f_items) == len(all_items)
        for i in range(len(all_items)):
            assert all_items[i] == f_items[i]
        cshift_pred_probs_df = pd.DataFrame(cshift_pred_probs, index=features_concat.get_items(), columns=range(2))

        # display the min and max probs
        print("Min: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].min())
        print("Mean: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].mean())
        print("Max: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].max())
        # HACK: need to prevent 0s in prob(y=0|x)
        p_train_values = cshift_pred_probs_df[0].values
        threshold = 0.01
        p_train_values[p_train_values < threshold] = threshold
        print("After thresholding")
        print("Min: %0.6f" % p_train_values[:n_train_all].min())
        print("Mean: %0.6f" % p_train_values[:n_train_all].mean())
        print("Max: %0.6f" % p_train_values[:n_train_all].max())

        # use the estimated probability of each item being a training item to compute item weights
        weights = n_train_all / float(n_test_all) * (1.0/p_train_values - 1)
        weights_df_all = pd.DataFrame(weights, index=all_items)
        # print a summary of the weights from just the training items
        print("Min weight: %0.4f" % weights[:n_train_all].min())
        print("Ave weight: %0.4f" % weights[:n_train_all].mean())
        print("Max weight: %0.4f" % weights[:n_train_all].max())
    else:
        weights_df_all = None

    # find the labeled items
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items" % n_labeled_items)
    labeled_items = set(labels_df.index)

    train_items_labeled = [i for i in train_items_all if i in labeled_items]

    test_items = [i for i in test_items_all if i in labeled_items]
    n_test = len(test_items)

    for r in range(repeats):

        # set seed very explicitly here to make sure experiments are comparable
        if seed is not None:
            seed += 1
            np.random.seed(seed)

        print("* Starting repetition %d *" % r)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + '_' + str(r)
        if n_train is not None and len(train_items_labeled) >= n_train:
            np.random.shuffle(train_items_labeled)
            train_items = np.random.choice(train_items_labeled, size=n_train, replace=False)
        else:
            print("Using all train items")
            train_items = train_items_labeled
        n_train_r = len(train_items)

        # now, choose a calibration set
        if n_calib > 0 and n_test >= n_calib:
            np.random.shuffle(test_items)
            calib_items = np.random.choice(test_items, size=n_calib, replace=False)
        elif n_test < n_calib:
            print("Error: Only %d labeled test instances available" % n_test)
            calib_items = test_items
        else:
            calib_items = []

        if weights_df_all is not None:
            weights_df = weights_df_all[labeled_item_selector]
        else:
            weights_df = None

        print("Labeled train: %d, test: %d" % (n_train_r, n_test))

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE', '95lcl', '95ucl', 'contains_test'])

        test_labels_df = labels_df.loc[test_items]
        # do a fake adjustment of the test label proportions
        # NOTE(review): this rebinds test_items, so later repetitions start
        # from the already-reduced list, and n_test is not recomputed --
        # confirm whether that is intended.
        if test_prop is not None:
            test_prop = float(test_prop)
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            order = list(np.argsort(test_label_props))

            true_prop = np.mean(test_label_props)
            if test_prop < true_prop:
                # add items from the lowest proportion upwards while the
                # running mean stays at or below the requested proportion
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                # the bound on i stops the loop from indexing past the last item
                while i < len(order) and (running / i) <= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]
            else:
                # same procedure, starting from the highest proportion
                order.reverse()
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while i < len(order) and (running / i) >= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]

            test_labels_df = labels_df.loc[test_items]
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            print("New props = %0.3f" % np.mean(test_label_props))

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_labeled_items, 1)), dtype=float)
            samples = np.zeros([n_labeled_items, n_classes], dtype=int)
            for i in range(n_labeled_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()
        if n_calib > 0:
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()
        else:
            calib_labels_df = None

        # get the true proportion of labels in the test data
        target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
        output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
        print("Train props:", train_props, train_estimate)
        # despite the "rmse" names, these errors are absolute differences
        train_rmse = np.abs(train_estimate - target_estimate)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train_r, 'train', 'test', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # get the same estimate from calibration data
        if n_calib > 0:
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            calib_rmse = np.abs(calib_estimate - target_estimate)
            calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]
        else:
            calib_estimate = 0.0
            calib_std = 1.0
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', np.nan, np.nan, np.nan, np.nan, np.nan]

        # treat the minority class (according to the training estimate) as positive
        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

        # Now train a model on the training data, saving the calibration data for calibration

        if stoplist_file is not None:
            stoplist = fh.read_text(stoplist_file)
            stoplist = {s.strip() for s in stoplist}
            print(stoplist)
        else:
            stoplist = None

        print("Training a LR model")
        # NOTE(review): the loss is hard-coded to 'log' here; the `loss`
        # argument is only used for the covariate-shift model -- confirm.
        model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
            project_dir, model_type, 'log', model_name, subset, sampled_labels_df, feature_defs,
            weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min,
            alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective,
            n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
            vocab=None, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr,
            min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, do_cfm=do_cfm,
            do_platt=do_platt, lower=lower, stoplist=stoplist, dropout=dropout, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

        X_test, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=test_items)
        test_predictions = model.predict(X_test)
        test_predictions_df = pd.DataFrame(test_predictions, index=features_concat.get_items(), columns=[label])
        test_pred_probs = model.predict_probs(X_test)
        _, n_labels = test_pred_probs.shape
        test_pred_probs_df = pd.DataFrame(test_pred_probs, index=features_concat.get_items(), columns=range(n_labels))

        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        # .values rather than DataFrame.as_matrix(), which newer pandas no longer provides
        true_test_vector = np.argmax(test_labels_df.values, axis=1)
        test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.values, min_bins=1, max_bins=1)
        test_cc_estimate, test_pcc_estimate = model.predict_proportions(X_test)

        test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
        test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

        results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

        output_df.loc['CC'] = [n_train_r, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
        output_df.loc['PCC'] = [n_train_r, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

        test_acc_estimate_internal, test_acc_ms_estimate_internal = model.predict_proportions(X_test, do_cfm=do_cfm)

        test_acc_rmse_internal = np.abs(test_acc_estimate_internal[1] - target_estimate)
        test_acc_ms_rmse_internal = np.abs(test_acc_ms_estimate_internal[1] - target_estimate)

        output_df.loc['ACC_internal'] = [n_train_r, 'train', 'test', 'n/a', test_acc_estimate_internal[1], test_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['MS_internal'] = [n_train_r, 'train', 'nontrain', 'predicted', test_acc_ms_estimate_internal[1], test_acc_ms_rmse_internal, np.nan, np.nan, np.nan]

        test_platt1_estimate, test_platt2_estimate = model.predict_proportions(X_test, do_platt=do_platt)

        test_platt1_rmse = np.abs(test_platt1_estimate[1] - target_estimate)
        test_platt2_rmse = np.abs(test_platt2_estimate[1] - target_estimate)

        output_df.loc['PCC_platt1'] = [n_train_r, 'train', 'test', 'n/a', test_platt1_estimate[1], test_platt1_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_platt2'] = [n_train_r, 'train', 'nontrain', 'predicted', test_platt2_estimate[1], test_platt2_rmse, np.nan, np.nan, np.nan]

        if n_calib > 0:
            # average the model-based estimate with the calibration-set estimate
            pcc_plus_cal_estimate = (test_pcc_estimate[1] + calib_estimate) / 2.0
            pcc_plus_cal_mae = np.mean(np.abs(pcc_plus_cal_estimate - target_estimate))

            output_df.loc['PCC_plus_cal'] = [n_train_r, 'train', 'test', 'n/a', pcc_plus_cal_estimate, pcc_plus_cal_mae, np.nan, np.nan, np.nan]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))

        """
Example #8
0
def cross_train_and_eval(project_dir,
                         reference_model_dir,
                         subset,
                         field_name,
                         config_file,
                         n_train=100,
                         field_val=None,
                         vocab_file=None,
                         group_identical=False,
                         suffix='',
                         model_type='MLP',
                         loss='log',
                         do_ensemble=True,
                         dh=100,
                         label='label',
                         n_dev_folds=5,
                         repeats=1,
                         verbose=False,
                         average='micro',
                         objective='calibration',
                         seed=None,
                         init_lr=1e-4,
                         min_epochs=2,
                         max_epochs=50,
                         early_stopping=False,
                         tol=1e-4,
                         patience=8):
    n_calib = 0
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + str(n_calib) + '_' + objective
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'n_calib': n_calib,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'do_ensemble': do_ensemble,
        'label': label,
        'field_val': field_val,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        'objective': objective,
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field_name].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)
    print(field_vals)

    if field_val is not None:
        field_vals = [field_val]

    # repeat the following value for each fold of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train_cshift = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train_cshift = len(non_train_items)

        print("Train: %d, non-train: %d" %
              (n_train_cshift, n_non_train_cshift))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'),
                                      index_col=0,
                                      header=0)
        n_items, n_classes = labels_df.shape

        weights_df = None

        # add in a stage to eliminate items with no labels?
        print("Subsetting items with labels")
        label_sums_df = labels_df.sum(axis=1)
        labeled_item_selector = label_sums_df > 0
        labels_df = labels_df[labeled_item_selector]
        n_items, n_classes = labels_df.shape
        labeled_items = set(labels_df.index)

        train_items = [i for i in train_items if i in labeled_items]
        non_train_items = [i for i in non_train_items if i in labeled_items]
        n_non_train = len(non_train_items)

        if weights_df is not None:
            weights_df = weights_df[labeled_item_selector]

        print("Starting repeats")
        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):
            print("* Repetition %d *" % r)
            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if n_train > 0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items,
                                                 size=n_train,
                                                 replace=False)
            else:
                train_items_r = train_items

            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'training data', 'test data',
                                         'cal', 'estimate', 'RMSE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])
            # create a unique name ofr this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            #n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            if n_calib > n_non_train:
                n_calib = int(n_non_train / 2)
                print(
                    "Warning!!: only %d non-train items; using 1/2 for calibration"
                    % n_non_train)

            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("Train: %d, calibration: %d, test: %d" %
                  (n_train_r, n_calib, n_test))
            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(
                non_train_labels_df)
            output_df.loc['target'] = [
                n_test, 'nontrain', 'nontrain', 'given', target_estimate, 0,
                target_estimate - 2 * target_std,
                target_estimate + 2 * target_std, np.nan
            ]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(
                train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [
                n_train_r, 'train', 'train', 'n/a', train_estimate, train_rmse,
                np.nan, np.nan, np.nan
            ]

            print(
                "target proportions: (%0.3f, %0.3f); train proportions: %0.3f"
                % (target_estimate - 2 * target_std,
                   target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            # repeat for labeled calibration data
            if n_calib > 0:
                calib_props, calib_estimate, calib_std = get_estimate_and_std(
                    calib_labels_df)
                calib_rmse = np.sqrt((calib_estimate - target_estimate)**2)
                # check if the test estimate is within 2 standard deviations of the estimate
                calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and calib_estimate < calib_estimate + 2 * calib_std
                output_df.loc['calibration'] = [
                    n_calib, 'calibration', 'nontrain', 'given',
                    calib_estimate, calib_rmse, calib_estimate - 2 * calib_std,
                    calib_estimate + 2 * calib_std, calib_contains_test
                ]

                # do a test using the number of annotations rather than the number of items
                calib_props2, calib_estimate2, calib_std2 = get_estimate_and_std(
                    calib_labels_df, use_n_annotations=True)
                calib_rmse2 = np.sqrt((calib_estimate2 - target_estimate)**2)
                calib_contains_test2 = target_estimate > calib_estimate2 - 2 * calib_std2 and calib_estimate < calib_estimate2 + 2 * calib_std2
                output_df.loc['calibration_n_annotations'] = [
                    n_calib, 'calibration', 'nontrain', 'given',
                    calib_estimate2, calib_rmse2,
                    calib_estimate2 - 2 * calib_std2,
                    calib_estimate2 + 2 * calib_std2, calib_contains_test2
                ]

            results_df = pd.DataFrame(
                [], columns=['f1', 'acc', 'calibration', 'calib overall'])

            # Now train a model on the training data, saving the calibration data for calibration

            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_brier_grouped(
                project_dir,
                model_name,
                subset,
                sampled_labels_df,
                feature_defs,
                weights_df=weights_df,
                vocab_file=vocab_file,
                group_identical=group_identical,
                items_to_use=train_items_r,
                intercept=True,
                n_dev_folds=n_dev_folds,
                do_ensemble=do_ensemble,
                dh=dh,
                seed=seed,
                pos_label=pos_label,
                verbose=verbose,
                init_lr=init_lr,
                min_epochs=min_epochs,
                max_epochs=max_epochs,
                early_stopping=early_stopping,
                tol=tol,
                patience=patience)
            results_df.loc['cross_val'] = [
                dev_f1, dev_acc, dev_cal, dev_cal_overall
            ]

            # predict on calibration data
            if n_calib > 0:
                calib_predictions_df, calib_pred_probs_df, calib_pred_proportions = predict.predict(
                    project_dir,
                    model,
                    model_name,
                    subset,
                    label,
                    items_to_use=calib_items,
                    verbose=verbose,
                    force_dense=True)
                calib_cc, calib_pcc, calib_acc, calib_pvc = calib_pred_proportions
                f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(
                    calib_labels_df,
                    calib_predictions_df,
                    calib_pred_probs_df,
                    pos_label=pos_label,
                    average=average,
                    verbose=False)
                true_calib_vector = np.argmax(calib_labels_df.as_matrix(),
                                              axis=1)
                calib_cal_rmse = evaluation.evaluate_calibration_rmse(
                    true_calib_vector, calib_pred_probs_df.as_matrix())
                calib_cal_rmse_overall = evaluation.evaluate_calibration_rmse(
                    true_calib_vector,
                    calib_pred_probs_df.as_matrix(),
                    min_bins=1,
                    max_bins=1)
                results_df.loc['calibration'] = [
                    f1_cal, acc_cal, calib_cal_rmse, calib_cal_rmse_overall
                ]

            # predict on test data
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose,
                force_dense=True)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels_df,
                test_predictions_df,
                test_pred_probs_df,
                pos_label=pos_label,
                average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            test_cal_rmse = evaluation.evaluate_calibration_rmse(
                true_test_vector, test_pred_probs_df.as_matrix())
            test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(
                true_test_vector,
                test_pred_probs_df.as_matrix(),
                min_bins=1,
                max_bins=1)
            results_df.loc['test'] = [
                f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall
            ]
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            # predict on calibration and test data combined
            nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=non_train_items,
                verbose=verbose,
                force_dense=True)
            nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

            if n_calib > 0:
                cc_calib_rmse = np.sqrt((calib_cc[1] - calib_estimate)**2)
                output_df.loc['CC_cal'] = [
                    n_non_train, 'train', 'calibration', 'predicted',
                    calib_cc[1], cc_calib_rmse, np.nan, np.nan, np.nan
                ]

                pcc_calib_rmse = np.sqrt((calib_pcc[1] - calib_estimate)**2)
                output_df.loc['PCC_cal'] = [
                    n_non_train, 'train', 'calibration', 'predicted',
                    calib_pcc[1], pcc_calib_rmse, np.nan, np.nan, np.nan
                ]

            cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
            pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

            output_df.loc['CC_nontrain'] = [
                n_non_train, 'train', 'nontrain', 'predicted',
                nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan
            ]
            output_df.loc['PCC_nontrain'] = [
                n_non_train, 'train', 'nontrain', 'predicted',
                nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan
            ]

            if n_calib > 0:
                averaged_cc_estimate = (
                    test_cc_estimate[1] * n_test +
                    calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pcc_estimate = (
                    test_pcc_estimate[1] * n_test +
                    calib_estimate * n_calib) / float(n_test + n_calib)

                averaged_cc_rmse = np.sqrt(
                    (averaged_cc_estimate - target_estimate)**2)
                averaged_pcc_rmse = np.sqrt(
                    (averaged_pcc_estimate - target_estimate)**2)

                output_df.loc['CC_nontrain_averaged'] = [
                    n_non_train, 'train', 'nontrain', 'given',
                    averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan,
                    np.nan
                ]
                output_df.loc['PCC_nontrain_averaged'] = [
                    n_non_train, 'train', 'nontrain', 'given',
                    averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan,
                    np.nan
                ]
            """
            nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
            nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)

            output_df.loc['ACC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)

                output_df.loc['ACC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

            # do calibration here using calibration data
            if n_calib > 0:
                # expand the data so as to only have singly-labeled, weighted items
                _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)

                #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values)
                acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                acc_corrected = calibration.apply_acc_binary(nontrain_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC'] = [n_non_train, 'train', 'nontrain', 'predicted', acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

                pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                pvc_corrected = calibration.apply_pvc(nontrain_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC'] = [n_non_train, 'train', 'nontrain', 'predicted', pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

                acc_corrected = calibration.apply_acc_binary(test_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                averaged_acc_estimate = (acc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate, averaged_acc_rmse, np.nan, np.nan, np.nan]

                pvc_corrected = calibration.apply_pvc(test_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                averaged_pvc_estimate = (pvc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate, averaged_pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn internal nontrain")
            #models = list(model._models.values())
            nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

            pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
            venn_estimate = np.mean(nontrain_preds_internal)

            venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
            venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
            output_df.loc['Venn_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            if n_calib > 0:
                print("Venn internal test")
                test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                pred_range = np.mean(test_pred_ranges_internal, axis=0)
                venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

                averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper

                output_df.loc['Venn_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                # Venn prediction using proper calibration data
                print("Venn calibration")
                calib_pred_ranges, calib_preds, calib_props_in_range, list_of_n_levels = ivap.estimate_probs_from_labels_cv(project_dir, model, model_name, sampled_labels_df, subset, calib_items=calib_items)
                print("Venn test")
                test_pred_ranges, test_preds = ivap.estimate_probs_from_labels(project_dir, model, model_name, sampled_labels_df, subset, subset, calib_items=calib_items, test_items=test_items)

                nontrain_pred_ranges = np.vstack([calib_pred_ranges, test_pred_ranges])
                nontrain_preds = np.r_[calib_preds, test_preds]

                nontrain_pred_range = np.mean(nontrain_pred_ranges, axis=0)
                nontrain_venn_estimate = np.mean(nontrain_preds)
                nontrain_venn_rmse = np.sqrt((nontrain_venn_estimate - target_estimate)**2)
                nontrain_contains_test = nontrain_pred_range[0] < target_estimate < nontrain_pred_range[1]
                output_df.loc['Venn'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_venn_estimate, nontrain_venn_rmse, nontrain_pred_range[0], nontrain_pred_range[1], nontrain_contains_test]

                test_pred_range = np.mean(test_pred_ranges, axis=0)
                averaged_venn_estimate = (np.mean(test_preds) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_venn_rmse = np.sqrt((averaged_venn_estimate - target_estimate)**2)

                averaged_lower = (test_pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (test_pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper

                output_df.loc['Venn_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_venn_estimate, averaged_venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                fh.write_list_to_text(calib_props_in_range, os.path.join(dirs.dir_models(project_dir), model_name, 'venn_calib_props_in_range.csv'))
                fh.write_list_to_text(list_of_n_levels, os.path.join(dirs.dir_models(project_dir), model_name, 'list_of_n_levels.csv'))
                results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))

            # now train a model on the training and calibration data combined
            if run_all:
                print("Training model on all labeled data")
                calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
                model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
                results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

                # get labels for test data
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
                results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, 0]
                results_df.loc['test_all'] = [f1_test, acc_test, test_cal_rmse, 0]

                nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=non_train_items, verbose=verbose)
                nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

                cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
                pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

                output_df.loc['CC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                    averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                    averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                    output_df.loc['CC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                    output_df.loc['PCC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

                nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
                nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)

                output_df.loc['ACC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                    averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)

                    output_df.loc['ACC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                    output_df.loc['PVC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

                print("Venn internal nontrain")
                nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

                pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
                venn_estimate = np.mean(nontrain_preds_internal)

                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
                output_df.loc['Venn_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

                if n_calib > 0:
                    print("Venn internal test")
                    test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                    pred_range = np.mean(test_pred_ranges_internal, axis=0)
                    venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

                    averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    venn_contains_test = averaged_lower < target_estimate < averaged_upper

                    output_df.loc['Venn_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            """
            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'accuracy.csv'))
            output_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))
Example #9
0
def test_over_time(project_dir,
                   subset,
                   config_file,
                   first_year,
                   stage1_logfile=None,
                   penalty='l2',
                   suffix='',
                   model_type='LR',
                   loss='log',
                   objective='f1',
                   do_ensemble=True,
                   dh=100,
                   label='label',
                   intercept=True,
                   n_dev_folds=5,
                   verbose=False,
                   average='micro',
                   seed=None,
                   alpha_min=0.01,
                   alpha_max=1000.0,
                   n_alphas=8,
                   sample_labels=False,
                   group_identical=False,
                   annotated_subset=None,
                   n_terms=0,
                   nonlinearity='tanh',
                   init_lr=1e-4,
                   min_epochs=2,
                   max_epochs=100,
                   patience=8,
                   tol=1e-4,
                   early_stopping=False,
                   DL=False):
    # Just run a regular model, one per year, training on the past, and save the results

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'first_year': first_year,
        'stage1_logfile': stage1_logfile,
        'penalty': penalty,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'n_terms': n_terms,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'early_stopping': early_stopping
    }

    model_basename = make_model_basename(log)
    stage1_model_basename = ''
    if stage1_logfile is not None:
        stage1_log = fh.read_json(stage1_logfile)
        stage1_model_basename = make_model_basename(stage1_log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))

    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata['year'].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)

    # DEBUG:
    field_vals = ['2009']

    for target_year in field_vals:
        if int(target_year) >= first_year:
            print("\nTesting on %s" % target_year)
            model_name = model_basename + '_' + str(target_year)
            stage1_model_name = stage1_model_basename + '_' + str(target_year)
            # first, split into training and non-train data based on the field of interest

            ## DEBUG!
            test_selector_all = metadata['year'] >= int(target_year)
            test_subset_all = metadata[test_selector_all]
            test_items_all = test_subset_all.index.tolist()
            n_test_all = len(test_items_all)

            train_selector_all = metadata['year'] < int(target_year)
            train_subset_all = metadata[train_selector_all]
            train_items_all = list(train_subset_all.index)
            n_train_all = len(train_items_all)

            print("Test year: %d Train: %d, Test: %d (labeled and unlabeled)" %
                  (int(target_year), n_train_all, n_test_all))

            # load all labels
            label_dir = dirs.dir_labels(project_dir, subset)
            labels_df = fh.read_csv_to_df(os.path.join(label_dir,
                                                       label + '.csv'),
                                          index_col=0,
                                          header=0)
            n_items, n_classes = labels_df.shape

            vocab = None
            if stage1_logfile is not None:

                fightin_lexicon = None
                if annotated_subset is not None:
                    print("Determining fightin' words")
                    fightin_words.find_most_annotated_features(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=train_items_all,
                        remove_stopwords=False)
                    fightin_lexicon, scores = fightin_words.load_from_config_files(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=train_items_all,
                        n=n_terms,
                        remove_stopwords=True)
                    fightin_lexicon_test, scores = fightin_words.load_from_config_files(
                        project_dir,
                        annotated_subset,
                        subset,
                        config_file,
                        items_to_use=test_items_all,
                        n=n_terms,
                        remove_stopwords=True)
                    print(fightin_lexicon)
                    #print(fightin_lexicon_test)
                    #vocab = list(fightin_lexicon)
                    #vocab.sort()

                print("Loading feature from stage 1")
                # load features from previous model
                top_features = get_top_features.get_top_features(
                    os.path.join(dirs.dir_models(project_dir),
                                 stage1_model_name), n_terms)
                lr_features, weights = zip(*top_features)
                vocab = list(lr_features)

                #if annotated_subset is not None:
                #    print("\nTaking intersection:")
                #    intersection = set(lr_features).intersection(set(fightin_lexicon))
                #    vocab = list(intersection)
                #    vocab.sort()
                #    for w in vocab:
                #        print(w)

                #vocab = [w for w in vocab if w not in stopwords]

                for w in vocab:
                    print(w)

                vocab.sort()

                #if annotated_subset is not None:
                #    print("Missing:")
                #    print(set(fightin_lexicon_test) - set(vocab))

            # add in a stage to eliminate items with no labels
            print("Subsetting items with labels")
            label_sums_df = labels_df.sum(axis=1)
            labeled_item_selector = label_sums_df > 0
            labels_df = labels_df[labeled_item_selector]
            n_items, n_classes = labels_df.shape
            labeled_items = set(labels_df.index)

            train_items = [i for i in train_items_all if i in labeled_items]
            test_items = [i for i in test_items_all if i in labeled_items]
            n_train = len(train_items)
            n_test = len(test_items)

            weights_df = None
            if weights_df is not None:
                weights_df = weights_df[labeled_item_selector]

            print("Labeled train: %d, test: %d" % (n_train, n_test))

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'training data', 'test data',
                                         'cal', 'estimate', 'MAE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])

            test_labels_df = labels_df.loc[test_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels
                temp = labels_df.values / np.array(
                    labels_df.values.sum(axis=1).reshape((n_items, 1)),
                    dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes),
                                             size=1,
                                             p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples,
                                                 index=labels_df.index,
                                                 columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_df = sampled_labels_df.loc[train_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(
                test_labels_df, use_n_annotations=True)

            output_df.loc['target'] = [
                n_test, 'test', 'test', 'n/a', target_estimate, 0,
                target_estimate - 2 * target_std,
                target_estimate + 2 * target_std, np.nan
            ]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(
                train_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [
                n_train, 'train', 'test', 'n/a', train_estimate, train_rmse,
                train_estimate - 2 * train_std, train_estimate + 2 * train_std,
                train_contains_test
            ]

            #print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            results_df = pd.DataFrame(
                [], columns=['f1', 'acc', 'mae', 'estimated calibration'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training a model")
            model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
                project_dir,
                model_type,
                loss,
                model_name,
                subset,
                sampled_labels_df,
                feature_defs,
                weights_df=weights_df,
                items_to_use=train_items,
                penalty='l2',
                alpha_min=alpha_min,
                alpha_max=alpha_max,
                n_alphas=n_alphas,
                intercept=intercept,
                objective=objective,
                n_dev_folds=n_dev_folds,
                do_ensemble=do_ensemble,
                dh=dh,
                seed=seed,
                pos_label=pos_label,
                vocab=vocab,
                group_identical=group_identical,
                nonlinearity=nonlinearity,
                init_lr=init_lr,
                min_epochs=min_epochs,
                max_epochs=max_epochs,
                patience=patience,
                tol=tol,
                early_stopping=early_stopping,
                verbose=verbose)
            results_df.loc['cross_val'] = [
                dev_f1, dev_acc, dev_cal_mae, dev_cal_est
            ]

            # predict on test data
            force_dense = False
            if model_type == 'MLP':
                force_dense = True
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose,
                force_dense=force_dense,
                group_identical=group_identical)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels_df,
                test_predictions_df,
                test_pred_probs_df,
                pos_label=pos_label,
                average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
            test_cal_est = evaluation.evaluate_calibration_rmse(
                true_test_vector,
                test_pred_probs_df.as_matrix(),
                min_bins=1,
                max_bins=1)
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            test_cc_mae = np.mean(np.abs(test_cc_estimate[1] -
                                         target_estimate))
            test_pcc_mae = np.mean(
                np.abs(test_pcc_estimate[1] - target_estimate))

            results_df.loc['test'] = [
                f1_test, acc_test, test_pcc_mae, test_cal_est
            ]

            output_df.loc['CC_test'] = [
                n_train, 'train', 'test', 'n/a', test_cc_estimate[1],
                test_cc_mae, np.nan, np.nan, np.nan
            ]
            output_df.loc['PCC_test'] = [
                n_train, 'train', 'test', 'n/a', test_pcc_estimate[1],
                test_pcc_mae, np.nan, np.nan, np.nan
            ]

            test_acc_rmse_internal = np.sqrt(
                (test_acc_estimate_internal[1] - target_estimate)**2)
            test_pvc_rmse_internal = np.sqrt(
                (test_pvc_estimate_internal[1] - target_estimate)**2)

            output_df.loc['ACC_internal'] = [
                n_train, 'train', 'test', 'n/a', test_acc_estimate_internal[1],
                test_acc_rmse_internal, np.nan, np.nan, np.nan
            ]
            output_df.loc['PVC_internal'] = [
                n_train, 'train', 'nontrain', 'predicted',
                test_pvc_estimate_internal[1], test_pvc_rmse_internal, np.nan,
                np.nan, np.nan
            ]
            """
            if DL:
                print("Training a model")
                model_type = 'DL'
                DL_model_name = model_name + '_DL'
                model, _, _, _, _ = train.train_model_with_labels(project_dir, model_type, loss, DL_model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol, early_stopping=early_stopping, verbose=verbose)

                # predict on test data
                force_dense = False
                if model_type == 'MLP':
                    force_dense = True
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, DL_model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=force_dense, group_identical=group_identical)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)

                #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
                test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

                test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
                test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

                output_df.loc['CC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
                output_df.loc['PCC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]
            """

            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'accuracy.csv'))
            output_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))
Example #10
0
def make_random_split(input_file,
                      field_name,
                      calib_percent,
                      overwrite=False,
                      sampling='proportional'):
    """
    Build leave-one-value-out train / calibration / test files from a JSON dataset.

    For every distinct value ``val`` of ``field_name`` found in the data, three
    files are written next to ``input_file``:

        <field_name>_<val>_train.json  all items whose field value is NOT ``val``
        <field_name>_<val>_calib.json  a random part of the items with that value
        <field_name>_<val>_test.json   the remaining items with that value

    :param input_file: path of the JSON file to split (read with fh.read_json);
        each entry is indexed by ``field_name``
    :param field_name: name of the per-item field whose values define the splits
    :param calib_percent: multiplied by the number of candidate items and
        truncated with int() to get the calibration set size
    :param overwrite: if False, exit via sys.exit as soon as an output file
        that already exists is encountered
    :param sampling: 'proportional' shuffles and cuts the items of each value
        separately; any other setting shuffles and cuts the whole dataset once
        and then restricts the resulting calibration / test pools to each value
    :return: None
    """
    basedir = os.path.dirname(input_file)
    data = fh.read_json(input_file)
    field_vals = set(record[field_name] for record in data.values())
    proportional = sampling == 'proportional'

    if not proportional:
        # a single global shuffle decides calibration vs. test membership
        shuffled = list(data.keys())
        random.shuffle(shuffled)
        n_items = len(shuffled)
        print("Loaded %d items" % n_items)

        n_calib = int(n_items * calib_percent)
        calib_pool = {k: data[k] for k in shuffled[:n_calib]}
        test_pool = {k: data[k] for k in shuffled[n_calib:]}

    for val in field_vals:
        print(val)
        # training data is always everything that does not carry this value
        train = {k: v for k, v in data.items() if v[field_name] != val}

        if proportional:
            # shuffle and cut only the items carrying this value
            held_out = {k: v for k, v in data.items() if v[field_name] == val}
            shuffled = list(held_out.keys())
            random.shuffle(shuffled)
            n_items = len(shuffled)
            print("Loaded %d items" % n_items)

            n_calib = int(n_items * calib_percent)
            calib = {k: data[k] for k in shuffled[:n_calib]}
            test = {k: data[k] for k in shuffled[n_calib:]}
        else:
            # restrict the precomputed global pools to this value
            calib = {
                k: v
                for k, v in calib_pool.items() if v[field_name] == val
            }
            test = {
                k: v
                for k, v in test_pool.items() if v[field_name] == val
            }

        print(
            "Creating train, calibration, and test sets of sizes %d, %d and %d, respectively"
            % (len(train), len(calib), len(test)))

        # write in the fixed order train, calib, test; each file is checked
        # immediately before it is written
        for suffix, split in (('train', train), ('calib', calib),
                              ('test', test)):
            output_file = os.path.join(
                basedir, '_'.join([field_name, str(val), suffix]) + '.json')
            if not overwrite and os.path.exists(output_file):
                sys.exit("Error: output file %s exists" % output_file)
            fh.write_to_json(split, output_file)
def _ranked_counts(counts):
    """Return (count, key) pairs from the mapping *counts*, largest first."""
    ranked = sorted((v, k) for k, v in counts.items())
    ranked.reverse()
    return ranked


def _write_lda_json(filename, obj):
    """Write *obj* to *filename* inside dirs.lda_dir with sort_keys=False."""
    fh.write_to_json(obj, os.path.join(dirs.lda_dir, filename), sort_keys=False)


def _build_vocab(token_lists):
    """Create a VocabWithCounts and add each token list to it, in order."""
    vocab = VocabWithCounts('', add_oov=False)
    for tokens in token_lists:
        vocab.add_tokens(tokens)
    return vocab


def extract_story_elements():
    """
    Extract story elements from every parsed article and write index files to dirs.lda_dir.

    Steps, in order:
      1. Read each ``*.json`` file under ``<dirs.data_stanford_dir>/parsed`` and
         call extract_story_elements_from_article on it, counting head words,
         attributes, agent roles and patient roles over all elements.
      2. Write the ranked counts (``common_*.json``) and the filtered
         ``most_common_*.json`` dictionaries: attributes and roles seen at
         least ``min_role_vocab`` times that are not in ``pronoun_list``
         (roles must additionally not be in ``stopwords``).
      3. Keep only elements that have at least one head word outside
         ``pronoun_list`` and at least ``min_tuples`` surviving
         (role, token) tuples. ``valid_heads``, ``valid_phrases``,
         ``valid_attributes``, ``valid_agent_roles``, ``valid_patient_roles``
         and ``tuples`` are stored as attributes on the element objects.
      4. Build one vocabulary over tuple tokens, one over valid head words and
         one over valid phrases, then flatten everything into parallel index
         lists (tuple -> vocab id / role / entity, entity -> document,
         head word / phrase -> vocab id / entity) and write them as JSON.

    Written with call-style ``print(...)`` of a single argument and
    ``sorted(...)`` so the same source behaves identically on Python 2 and
    parses on Python 3.

    :return: None; all results are written to files in dirs.lda_dir
    """
    min_role_vocab = 4   # minimum corpus count for an attribute / role to be kept
    min_tuples = 3       # minimum number of surviving tuples for an entity to be kept

    # integer role codes stored in tuple_role.json
    # (the original scheme also defined SURFACE_FORM = 3, which is never emitted)
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies = fh.read_json(os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json'))
    coref_heads = fh.read_json(os.path.join(dirs.data_stanford_dir, 'coref_heads.json'))
    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print("Extracting story elements")
    for f in parsed_files:
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        element_list = extract_story_elements_from_article(
            sentences, dependencies[basename], coref_heads[basename],
            supersense_tags[basename], basename)
        story_elements[basename] = element_list
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print("Finding most common tokens")
    _write_lda_json('common_heads.json', _ranked_counts(heads))

    common_attributes = _ranked_counts(attributes)
    _write_lda_json('common_attributes.json', common_attributes)

    common_agent_roles = _ranked_counts(agent_roles)
    _write_lda_json('common_agent_roles.json', common_agent_roles)

    common_patient_roles = _ranked_counts(patient_roles)
    _write_lda_json('common_patient_roles.json', common_patient_roles)

    print(pronoun_list)
    most_common_attributes = {
        k: v for v, k in common_attributes
        if v >= min_role_vocab and k not in pronoun_list
    }
    most_common_agent_roles = {
        k: v for v, k in common_agent_roles
        if v >= min_role_vocab and k not in pronoun_list and k not in stopwords
    }
    most_common_patient_roles = {
        k: v for v, k in common_patient_roles
        if v >= min_role_vocab and k not in pronoun_list and k not in stopwords
    }

    _write_lda_json('most_common_attributes.json', most_common_attributes)
    _write_lda_json('most_common_agent_roles.json', most_common_agent_roles)
    _write_lda_json('most_common_patient_roles.json', most_common_patient_roles)

    print(len(most_common_attributes))
    print(len(most_common_agent_roles))
    print(len(most_common_patient_roles))

    print("Filtering tuples")
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            se.valid_phrases = [h for h in se.phrases if h not in pronoun_list]
            if not se.valid_heads:
                # entities referred to only by pronouns are dropped
                continue
            se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
            se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
            se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
            se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                        [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                        [(PATIENT_ROLE, t) for t in se.valid_patient_roles]

            if len(se.tuples) >= min_tuples:
                valid_elements[basename].append(se)

    print("Constructing vocabulary")
    valid_entities = [se for element_list in valid_elements.values() for se in element_list]
    n_entities = len(valid_entities)
    n_tuples = sum(len(se.tuples) for se in valid_entities)

    # the three vocabularies are built one after another, each over the same
    # entity order, exactly as three separate passes would
    vocab = _build_vocab([token for _role, token in se.tuples] for se in valid_entities)
    head_word_vocab = _build_vocab(list(se.valid_heads) for se in valid_entities)
    head_phrase_vocab = _build_vocab(list(se.valid_phrases) for se in valid_entities)

    print("Building indices")
    tuple_vocab = np.zeros(n_tuples, dtype=int)     # vocab index of the ith tuple
    tuple_entity = np.zeros(n_tuples, dtype=int)    # entity index of the ith tuple
    tuple_role = []                                 # role code of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)    # document index of the ith entity
    # sorted() instead of keys() + .sort(): dict.keys() is not a list on Python 3
    docs = sorted(valid_elements.keys())

    vocab_counts = np.zeros(len(vocab), dtype=int)

    article_mapping = []
    entity_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    t_i = 0
    for d_i, d in enumerate(docs):
        for se in valid_elements[d]:
            entity_doc[entity_index] = d_i
            for role, token in se.tuples:
                tuple_entity[t_i] = entity_index
                tuple_role.append(role)
                vocab_index = vocab.get_index(token)
                tuple_vocab[t_i] = vocab_index
                vocab_counts[vocab_index] += 1
                t_i += 1
            for token in se.valid_heads:
                head_word_vocab_list.append(head_word_vocab.get_index(token))
                head_word_entity_list.append(entity_index)
            for token in se.valid_phrases:
                head_phrase_vocab_list.append(head_phrase_vocab.get_index(token))
                head_phrase_entity_list.append(entity_index)

            # human-readable record: index:doc:heads:attributes:agent roles:patient roles
            # (note: uses all head words, not only the valid ones)
            article_mapping.append(
                str(entity_index) + ':' + d + ':' + ','.join(se.head_words) +
                ':' + ','.join(se.valid_attributes) +
                ':' + ','.join(se.valid_agent_roles) +
                ':' + ','.join(se.valid_patient_roles))
            entity_index += 1

    print("%s valid documents" % len(docs))
    print("%s entities" % entity_index)
    print("%s tuples" % t_i)
    print("%s word types" % len(vocab))
    print("%s %s %s" % (np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)))

    # ndarray.tolist() yields plain Python ints rather than NumPy scalar objects,
    # so the JSON writer receives only built-in types
    _write_lda_json('tuple_vocab.json', tuple_vocab.tolist())
    _write_lda_json('tuple_role.json', tuple_role)
    _write_lda_json('tuple_entity.json', tuple_entity.tolist())
    _write_lda_json('entity_doc.json', entity_doc.tolist())
    _write_lda_json('vocab.json', vocab.index2token)
    _write_lda_json('docs.json', docs)
    _write_lda_json('article_map.json', article_mapping)
    _write_lda_json('head_word_vocab.json', head_word_vocab.index2token)
    _write_lda_json('head_phrase_vocab.json', head_phrase_vocab.index2token)
    _write_lda_json('head_word_vocab_list.json', head_word_vocab_list)
    _write_lda_json('head_word_entity_list.json', head_word_entity_list)
    _write_lda_json('head_phrase_vocab_list.json', head_phrase_vocab_list)
    _write_lda_json('head_phrase_entity_list.json', head_phrase_entity_list)
Example #12
0
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, train_prop=1.0, prefix=None, max_folds=None, min_val=None, max_val=None, model_type='LR', loss='log', do_ensemble=False, dh=0, label='label', penalty='l1', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1', seed=None, use_calib_pred=False, exclude_calib=False, alpha_min=0.01, alpha_max=1000, sample_labels=False):
    """Compare label-proportion estimators on held-out partitions of `field_name`.

    For every distinct value `v` of the `field_name` column of the subset's
    metadata.csv (after optional `min_val` / `max_val` filtering, limited to
    the first `max_folds` values), items whose field equals `v` form the
    non-training pool and all other items form the training pool. For each of
    `repeats` random splits the non-training items are shuffled and divided
    into a calibration set (the first `calib_prop` fraction) and a test set.
    One model is trained on training + calibration items and another on the
    training items only; CC, PCC, ACC, PVC and Venn estimates are then compared
    with the estimate computed from the true labels.

    Side effects: writes the experiment parameters to a JSON log file, and for
    every fold / repeat writes `results.csv` and `<field_name>_<v>.csv` into
    the model's directory. Returns None.

    :param project_dir: project directory, handed to the `dirs` helpers
    :param subset: name of the subset whose metadata and labels are read
    :param field_name: metadata column that defines the folds
    :param config_file: JSON file whose 'feature_defs' list is parsed into feature definitions
    :param calib_prop: fraction of the non-training items used for calibration
    :param train_prop: fraction of the training items to keep (values >= 1.0 keep all of them)
    :param prefix: optional prefix for the model / log base name
    :param max_folds: maximum number of field values to evaluate (default: all)
    :param min_val: if given, only field values >= float(min_val) are used
    :param max_val: if given, only field values <= float(max_val) are used
    :param label: base name of the label CSV file in the labels directory
    :param cshift: if not None, learn covariate-shift weights for the items
    :param repeats: number of random train / calibration / test splits per fold
    :param use_calib_pred: if False, the calibration "predictions" are derived
        from the calibration labels instead of from the model
    :param exclude_calib: if True, estimates cover the test items only, rather
        than test and calibration items combined
    :param sample_labels: if True, sample a single label per item in proportion
        to its annotations
    :param pos_label, average: passed to evaluate_predictions.evaluate_predictions
    :param verbose: passed to the train / predict helpers

    The remaining parameters (model_type, loss, do_ensemble, dh, penalty,
    intercept, n_dev_folds, objective, seed, alpha_min, alpha_max) are passed
    through to train.train_model_with_labels.
    """

    model_basename = subset + '_' + field_name
    if prefix is not None:
        model_basename = prefix + '_' + model_basename

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'calib_prop': calib_prop,
        'train_prop': train_prop,
        'prefix': prefix,
        'max_folds': max_folds,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'do_ensemble': do_ensemble,
        'label': label,
        'penalty': penalty,
        'cshift': cshift,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'pos_label': pos_label,
        'average': average,
        'use_calib_pred': use_calib_pred,
        'exclude_calib': exclude_calib,
        # these also change the outcome of a run, so record them as well
        'min_val': min_val,
        'max_val': max_val,
        'seed': seed,
        'sample_labels': sample_labels
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = [features.parse_feature_string(f) for f in config['feature_defs']]

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field_name].values))
    field_vals.sort()
    print(field_vals)

    # exclude certain values of the partition if desired
    if min_val is not None:
        field_vals = [v for v in field_vals if v >= float(min_val)]

    if max_val is not None:
        field_vals = [v for v in field_vals if v <= float(max_val)]

    if max_folds is None:
        max_folds = len(field_vals)

    # repeat the following for each fold of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals[:max_folds]):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train = len(non_train_items)

        print("Train: %d, non-train: %d" % (n_train, n_non_train))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
        n_items, n_classes = labels_df.shape

        # if desired, attempt to learn weights for the training data using techniques for covariate shift
        if cshift is not None:
            print("Training a classifier for covariate shift")
            # start by learning to discriminate train from non-train data
            # NOTE(review): indexing with the metadata selectors assumes metadata and labels
            # contain the same items in the same order -- confirm
            train_test_labels = np.zeros((n_items, 2), dtype=int)
            train_test_labels[train_selector, 0] = 1
            train_test_labels[non_train_selector, 1] = 1
            train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1])
            # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
            model_name = model_basename + '_' + str(v) + '_' + 'cshift'
            model, dev_f1, dev_acc, dev_cal, _, _ = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs,
                penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept,
                n_dev_folds=n_dev_folds, save_model=True, do_ensemble=do_ensemble, dh=dh, seed=seed,
                verbose=False)
            print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

            # take predictions from model on the training data
            train_test_pred_df, train_test_probs_df = predict.predict(project_dir, model, model_name, subset, label, verbose=verbose)
            # display the min and max probs
            print("Min: %0.4f" % train_test_probs_df[1].min())
            print("Max: %0.4f" % train_test_probs_df[1].max())
            # use the estimated probability of each item being a training item (column 0) to compute item weights
            weights = n_train / float(n_non_train) * (1.0/train_test_probs_df[0].values - 1)
            # print a summary of the weights from just the training items
            print("Min weight: %0.4f" % weights[train_selector].min())
            print("Ave weight: %0.4f" % weights[train_selector].mean())
            print("Max weight: %0.4f" % weights[train_selector].max())
            # print a summary of all weights
            print("Min weight: %0.4f" % weights.min())
            print("Ave weight: %0.4f" % weights.mean())
            print("Max weight: %0.4f" % weights.max())
            # create a data frame with this information
            weights_df = pd.DataFrame(weights, index=labels_df.index)
        else:
            weights_df = None

        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):

            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if train_prop < 1.0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items, size=int(n_train * train_prop), replace=False)
            else:
                # no subsampling requested: use every training item (without this branch the
                # default train_prop=1.0 left train_items_r / n_train_r undefined)
                train_items_r = np.array(train_items)
            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])
            # create a unique name for this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("%d %d %d" % (n_train_r, n_calib, n_test))
            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels so that each row is a distribution over classes
                temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            if exclude_calib:
                test_props, test_estimate, test_std = get_estimate_and_std(test_labels_df)
            else:
                test_props, test_estimate, test_std = get_estimate_and_std(non_train_labels_df)
            output_df.loc['test'] = [n_test, test_estimate, 0, test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - test_estimate)**2)
            train_contains_test = test_estimate > train_estimate - 2 * train_std and test_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train_r, train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

            # repeat for calibration data
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df)
            calib_rmse = np.sqrt((calib_estimate - test_estimate)**2)
            # check if the test estimate is within 2 standard deviations of the calibration estimate
            # (the upper bound must be compared against test_estimate, not calib_estimate)
            calib_contains_test = test_estimate > calib_estimate - 2 * calib_std and test_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calibration'] = [n_calib, calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'cal'])

            print("Training model on all labeled data")
            # first train a model on the training and calibration data combined
            calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
            model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs,
                weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty,
                alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective,
                n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose)
            results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal]

            # get labels for test data
            test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            results_df.loc['test_all'] = [f1_test, acc_test, 0.0]

            # combine the predictions on the test and calibration data (unless excluding calibration data from this)
            if exclude_calib:
                test_predictions = test_predictions_df.values
                test_pred_probs = test_pred_probs_df.values
            else:
                # get labels for calibration data
                if use_calib_pred:
                    calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose)
                else:
                    calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index)
                    # normalize labels to get (questionable) estimates of probabilities
                    calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index)

                test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values]
                test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values])

            # get the basic error estimates for this model
            cc_estimate = np.mean(test_predictions)
            cc_rmse = np.sqrt((cc_estimate - test_estimate)**2)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[:, 1])
            pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2)

            output_df.loc['CC_all'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC_all'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan]

            # Now repeat for a model trained on the training data, saving the calibration data for calibration
            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs,
                weights_df=weights_df, items_to_use=train_items_r, penalty=penalty,
                alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective,
                n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal]

            # predict on calibration data
            calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose)
            f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df, calib_pred_probs_df, pos_label=pos_label, average=average, verbose=False)
            results_df.loc['calibration'] = [f1_cal, acc_cal, calib_rmse]

            # predict on test data
            test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
            results_df.loc['test'] = [f1_test, acc_test, 0.0]
            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))

            # combine the predictions on the test and calibration data (unless excluding calibration data from this)
            if exclude_calib:
                test_predictions = test_predictions_df.values
                test_pred_probs = test_pred_probs_df.values
            else:
                if not use_calib_pred:
                    # NOTE(review): this replaces the model's calibration predictions with values
                    # derived from the labels, and those replaced frames are what the PCC_cal,
                    # ACC and PVC steps below read -- confirm this is intended
                    calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index)
                    # normalize labels to get (questionable) estimates of probabilities
                    calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index)

                test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values]
                test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values])

            # now evaluate in terms of predicted proportions
            # average the predictions (assuming binary labels)
            cc_estimate = np.mean(test_predictions)
            cc_rmse = np.sqrt((cc_estimate - test_estimate)**2)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[:, 1])
            pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2)

            pcc_calib_estimate = np.mean(calib_pred_probs_df.values[:, 1])
            pcc_calib_rmse = np.sqrt((pcc_calib_estimate - calib_estimate)**2)

            output_df.loc['PCC_cal'] = [n_calib, pcc_calib_estimate, pcc_calib_rmse, np.nan, np.nan, np.nan]
            output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan]

            # expand the data so as to only have singly-labeled, weighted items
            _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)

            # do some sort of calibration here (ACC, PACC, PVC)
            print("ACC correction")
            acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
            acc_corrected = calibration.apply_acc_binary(test_predictions, acc)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2)
            output_df.loc['ACC'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

            # same correction, but using the matrix returned by training instead of the calibration set
            print("ACC internal")
            acc_corrected = calibration.apply_acc_binary(test_predictions, acc_cfm)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2)
            output_df.loc['ACC_int'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

            print("PVC correction")
            pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
            pvc_corrected = calibration.apply_pvc(test_predictions, pvc)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2)
            output_df.loc['PVC'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

            print("PVC internal")
            pvc_corrected = calibration.apply_pvc(test_predictions, pvc_cfm)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2)
            output_df.loc['PVC_int'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn")
            test_pred_ranges, calib_pred_ranges = ivap.estimate_probs_from_labels(project_dir, model, model_name, subset, subset, sampled_labels_df, calib_items, test_items, weights_df=None)

            if not exclude_calib:
                test_pred_ranges = np.vstack([test_pred_ranges, calib_pred_ranges])

            # collapse each (lower, upper) pair into a single value
            combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1])

            pred_range = np.mean(test_pred_ranges, axis=0)
            venn_estimate = np.mean(combo)

            venn_rmse = np.sqrt((venn_estimate - test_estimate)**2)
            venn_contains_test = pred_range[0] < test_estimate < pred_range[1]
            output_df.loc['Venn'] = [n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv')
            output_df.to_csv(output_filename)
Example #13
0
def cross_train_and_eval(project_dir, subset, config_file, n_train=500, suffix='', model_type='LR', loss='log', do_ensemble=True, dh=100, label='label', penalty='l1', intercept=True, n_dev_folds=5, repeats=1, verbose=False, average='micro', objective='f1', seed=None, alpha_min=0.01, alpha_max=1000.0, sample_labels=False, run_all=False):
    """Train on a random sample of labeled items and estimate label proportions over all of them.

    Items with no labels are dropped. For each of `repeats` repetitions the
    labeled items are shuffled, the first `n_train` become the training set
    and the rest the test set (there is no calibration set; n_calib is 0).
    A model is trained on the training items and several proportion estimates
    (CC, PCC, ACC, PVC, Venn) are compared with the estimate computed from the
    labels of all labeled items.

    Side effects: writes the experiment parameters to a JSON log file, and for
    every repetition writes `accuracy.csv` and `results.csv` into the model's
    directory. Returns None.

    :param project_dir: project directory, handed to the `dirs` helpers
    :param subset: name of the subset whose labels are read
    :param config_file: JSON file whose 'feature_defs' list is parsed into feature definitions
    :param n_train: number of (shuffled) labeled items used for training
    :param suffix: string appended to the model base name
    :param dh: passed to training; also part of the model name when model_type is 'MLP'
    :param label: base name of the label CSV file in the labels directory
    :param repeats: number of random train / test splits
    :param sample_labels: if True, sample a single label per item in proportion
        to its annotations
    :param average: passed to evaluate_predictions.evaluate_predictions
    :param verbose: passed to the train / predict helpers
    :param run_all: not referenced in this function

    The remaining parameters (model_type, loss, do_ensemble, penalty,
    intercept, n_dev_folds, objective, seed, alpha_min, alpha_max) are passed
    through to train.train_model_with_labels.
    """

    field_name = 'nosplit'
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type + '_' + penalty
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + objective
    if sample_labels:
        model_basename += '_sampled'
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': 'nosplit',
        'config_file': config_file,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'do_ensemble': do_ensemble,
        'label': label,
        'penalty': penalty,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        'sample_labels': sample_labels
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = [features.parse_feature_string(f) for f in config['feature_defs']]

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)

    # no item weights are used in this experiment
    weights_df = None

    # eliminate items with no labels
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_items, n_classes = labels_df.shape
    labeled_items = list(set(labels_df.index))

    print("Starting repeats")
    # repeat the following process multiple times with different random splits of train / test data
    for r in range(repeats):
        print("* Repetition %d *" % r)
        # take a random subset of the labeled items as training data
        # NOTE(review): if n_train exceeds the number of labeled items, test_items is empty
        np.random.shuffle(labeled_items)
        train_items = labeled_items[:n_train]
        test_items = labeled_items[n_train:]
        n_test = len(test_items)
        n_calib = 0

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])
        # create a unique name for this model
        model_name = model_basename + '_' + 'nosplit' + '_' + str(r)

        print("Train: %d, calibration: %d, test: %d" % (n_train, n_calib, n_test))
        test_labels_df = labels_df.loc[test_items]

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels so that each row is a distribution over classes
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
            samples = np.zeros([n_items, n_classes], dtype=int)
            for i in range(n_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()

        # get the target proportion of labels, computed over all labeled items
        target_props, target_estimate, target_std = combo.get_estimate_and_std(labels_df)
        output_df.loc['target'] = [n_test, 'n/a', 'all', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = combo.get_estimate_and_std(train_labels_df)
        # compute the error of this estimate
        train_rmse = np.sqrt((train_estimate - target_estimate)**2)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train, 'train', 'train', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # do a test using the number of annotations rather than the number of items
        train_props2, train_estimate2, train_std2 = combo.get_estimate_and_std(train_labels_df, use_n_annotations=True)
        # compute the error of this estimate
        train_rmse2 = np.sqrt((train_estimate2 - target_estimate)**2)
        train_contains_test2 = target_estimate > train_estimate2 - 2 * train_std2 and target_estimate < train_estimate2 + 2 * train_std2
        output_df.loc['train_n_annotations'] = [n_train, 'train', 'train', 'n/a', train_estimate2, train_rmse2, train_estimate2 - 2 * train_std2, train_estimate2 + 2 * train_std2, train_contains_test2]

        print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

        # label 0 becomes the positive label when the training estimate exceeds 0.5, otherwise label 1
        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall'])

        # Now train a model on the training data
        print("Training model on training data only")
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
            project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs,
            weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min,
            alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds,
            do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

        # predict on test data
        test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same ndarray
        test_pred_probs = test_pred_probs_df.values
        true_test_vector = np.argmax(test_labels_df.values, axis=1)
        test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs)
        test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs, min_bins=1, max_bins=1)
        results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall]
        test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

        # predict on all labeled items (training and test combined)
        all_predictions_df, all_pred_probs_df, all_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=labeled_items, verbose=verbose)
        all_cc_estimate, all_pcc_estimate, all_acc_estimate_internal, all_pvc_estimate_internal = all_pred_proportions

        cc_rmse = np.sqrt((all_cc_estimate[1] - target_estimate)**2)
        pcc_rmse = np.sqrt((all_pcc_estimate[1] - target_estimate)**2)

        output_df.loc['CC_all'] = [n_items, 'train', 'all', 'predicted', all_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_all'] = [n_items, 'train', 'all', 'predicted', all_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

        # combine the predicted test proportion with the training estimate, weighted by set size
        averaged_cc_estimate = (test_cc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train)
        averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train)

        averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
        averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

        output_df.loc['CC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

        all_acc_rmse_internal = np.sqrt((all_acc_estimate_internal[1] - target_estimate) ** 2)
        all_pvc_rmse_internal = np.sqrt((all_pvc_estimate_internal[1] - target_estimate) ** 2)

        output_df.loc['ACC_internal'] = [n_items, 'train', 'all', 'predicted', all_acc_estimate_internal[1], all_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['PVC_internal'] = [n_items, 'train', 'all', 'predicted', all_pvc_estimate_internal[1], all_pvc_rmse_internal, np.nan, np.nan, np.nan]

        print("Venn internal all")
        all_pred_ranges_internal, all_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, labeled_items, plot=False)

        pred_range = np.mean(all_pred_ranges_internal, axis=0)
        venn_estimate = np.mean(all_preds_internal)

        venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
        venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
        output_df.loc['Venn_internal'] = [n_items, 'train', 'all', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

        print("Venn internal test")
        test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

        pred_range = np.mean(test_pred_ranges_internal, axis=0)
        venn_estimate = (np.mean(test_preds_internal) * n_test + train_estimate * n_train) / float(n_test + n_train)
        venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)

        # size-weighted average of the test interval and the training +/- 2 std interval
        averaged_lower = (pred_range[0] * n_test + (train_estimate - 2 * train_std) * n_train) / float(n_test + n_train)
        averaged_upper = (pred_range[1] * n_test + (train_estimate + 2 * train_std) * n_train) / float(n_test + n_train)
        venn_contains_test = averaged_lower < target_estimate < averaged_upper

        output_df.loc['Venn_internal_averaged'] = [n_items, 'train', 'all', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
Example #14
0
def cross_train_and_eval(project_dir,
                         subset,
                         field_name,
                         config_file,
                         calib_prop=0.33,
                         nontest_prop=1.0,
                         prefix=None,
                         max_folds=None,
                         model_type='LR',
                         label='label',
                         penalty='l2',
                         cshift=None,
                         intercept=True,
                         n_dev_folds=5,
                         repeats=1,
                         verbose=False,
                         pos_label=1,
                         average='micro',
                         objective='f1'):
    """Hold out one value of a metadata field at a time and compare
    label-proportion estimators on the held-out items.

    For each distinct value ``v`` of ``metadata[field_name]`` (in sorted
    order, at most ``max_folds`` of them) the items with that value are the
    test set and all remaining items are the non-test pool.  For each of
    ``repeats`` random splits, the non-test pool is divided into calibration
    and training items, a model is trained on the training items, and the
    proportion of the positive label in the test set is estimated by
    classify-and-count (CC), probabilistic classify-and-count (PCC), the
    ACC and PVC corrections computed on the calibration items, and IVAP
    ("Venn").  The estimates assume binary labels (column 1 is treated as
    the positive class).

    Side effects: writes a JSON log of the arguments to the project's log
    directory and, per fold and repeat, ``results.csv`` and
    ``<field_name>_<v>.csv`` into the model's directory.  Random splits use
    the global ``np.random`` state, so seed it beforehand for
    reproducibility.

    :param project_dir: project root, passed to the ``dirs`` helpers
    :param subset: name of the data subset (locates metadata and labels,
        and forms part of every model name)
    :param field_name: metadata column whose distinct values define the folds
    :param config_file: JSON file with a ``'feature_defs'`` list of feature
        strings
    :param calib_prop: fraction of the non-test items used for calibration
    :param nontest_prop: if below 1.0, randomly subsample the non-test pool
        to this fraction
    :param prefix: optional string prepended to the model base name
    :param max_folds: maximum number of field values to test on
        (``None`` means all of them)
    :param model_type: forwarded to ``train.train_model_with_labels``
    :param label: base name of the label file (``label + '.csv'``); also
        forwarded to ``predict.predict`` and used as the predictions column
    :param penalty: forwarded to ``train.train_model_with_labels``
    :param cshift: if not ``None``, first train a test-vs-non-test
        classifier and use it to weight the items (covariate shift)
    :param intercept: forwarded to ``train.train_model_with_labels``
    :param n_dev_folds: forwarded to ``train.train_model_with_labels``
    :param repeats: number of random train/calibration splits per fold
    :param verbose: forwarded to the training and prediction calls
    :param pos_label: forwarded to ``evaluate_predictions``
    :param average: forwarded to ``evaluate_predictions``
    :param objective: forwarded to ``train.train_model_with_labels``
    :return: None
    """

    model_basename = subset + '_' + field_name
    if prefix is not None:
        model_basename = prefix + '_' + model_basename

    # record the arguments of this run before doing any work
    logfile = os.path.join(dirs.dir_logs(project_dir),
                           model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'calib_prop': calib_prop,
        # NOTE: the key is 'train_prop' but the value logged is nontest_prop
        'train_prop': nontest_prop,
        'prefix': prefix,
        'max_folds': max_folds,
        'model_type': model_type,
        'label': label,
        'penalty': penalty,
        'cshift': cshift,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'pos_label': pos_label,
        'average': average
    }
    fh.write_to_json(log, logfile)

    config = fh.read_json(config_file)
    feature_defs = [
        features.parse_feature_string(f) for f in config['feature_defs']
    ]

    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset),
                                 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = sorted(set(metadata[field_name].values))
    print(field_vals)

    if max_folds is None:
        max_folds = len(field_vals)

    # load all labels once; they are the same for every fold
    # (the repeat loop below already relies on labels_df not being modified
    # by the training / prediction calls)
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'),
                                  index_col=0,
                                  header=0)
    n_items, n_classes = labels_df.shape

    for v in field_vals[:max_folds]:

        print("\nTesting on %s" % v)
        nontest_selector = metadata[field_name] != v
        nontest_subset = metadata[nontest_selector]
        nontest_items = list(nontest_subset.index)
        n_nontest = len(nontest_items)

        test_selector = metadata[field_name] == v
        test_subset = metadata[test_selector]
        test_items = test_subset.index.tolist()
        n_test = len(test_items)

        # subsample the non-test items if desired
        # (the shuffle is kept so the sequence of random draws is unchanged)
        if nontest_prop < 1.0:
            np.random.shuffle(nontest_items)
            nontest_items = np.random.choice(nontest_items,
                                             size=int(n_nontest *
                                                      nontest_prop),
                                             replace=False)
            n_nontest = len(nontest_items)

        nontest_labels = labels_df.loc[nontest_items]

        if cshift is not None:
            print("Training a classifier for covariate shift")
            # start by learning to discriminate test from non-test data:
            # column 0 marks non-test items, column 1 marks test items
            # (the boolean selectors come from metadata, so this assumes the
            # metadata rows line up with the rows of labels_df)
            train_test_labels = np.zeros((n_items, 2), dtype=int)
            train_test_labels[nontest_selector, 0] = 1
            train_test_labels[test_selector, 1] = 1
            train_test_labels_df = pd.DataFrame(train_test_labels,
                                                index=labels_df.index,
                                                columns=[0, 1])
            model_name = model_basename + '_' + str(v) + '_' + 'cshift'
            model, dev_f1, dev_cal, _, _ = train.train_model_with_labels(
                project_dir,
                model_type,
                model_name,
                subset,
                train_test_labels_df,
                feature_defs,
                penalty=penalty,
                intercept=intercept,
                n_dev_folds=n_dev_folds,
                verbose=False)

            train_test_pred_df, train_test_probs_df = predict.predict(
                project_dir, model, model_name, subset, label, verbose=verbose)
            print("Min: %0.4f" % train_test_probs_df[1].min())
            print("Max: %0.4f" % train_test_probs_df[1].max())
            # base the weights on the probability of each item being a training item:
            # (1 / p0 - 1) is the odds of "test" versus "non-test", rescaled
            # by the ratio of the two set sizes
            weights = n_nontest / float(n_test) * (
                1.0 / train_test_probs_df[0].values - 1)
            # first three lines: weights of the non-test items only
            print("Min weight: %0.4f" % weights[nontest_selector].min())
            print("Ave weight: %0.4f" % weights[nontest_selector].mean())
            print("Max weight: %0.4f" % weights[nontest_selector].max())
            # next three lines: weights over all items
            print("Min weight: %0.4f" % weights.min())
            print("Ave weight: %0.4f" % weights.mean())
            print("Max weight: %0.4f" % weights.max())
            weights_df = pd.DataFrame(weights, index=labels_df.index)
        else:
            weights_df = None

        # repeat the following process multiple times with different random splits of calibration / test data
        for r in range(repeats):
            output_df = pd.DataFrame([],
                                     columns=[
                                         'N', 'estimate', 'RMSE', '95lcl',
                                         '95ucl', 'contains_test'
                                     ])

            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # split the non-test items into train and calibration
            n_calib = int(n_nontest * calib_prop)
            np.random.shuffle(nontest_items)
            calib_items = nontest_items[:n_calib]
            train_items = nontest_items[n_calib:]

            train_labels = labels_df.loc[train_items]
            calib_labels = labels_df.loc[calib_items]
            test_labels = labels_df.loc[test_items]

            # get the label proportions from the test and non-test data;
            # the test estimate is the target all other rows are compared to
            test_props, test_estimate, test_std = get_estimate_and_std(
                test_labels)
            output_df.loc['test'] = [
                n_test, test_estimate, 0, test_estimate - 2 * test_std,
                test_estimate + 2 * test_std, 1
            ]

            nontest_props, nontest_estimate, nontest_std = get_estimate_and_std(
                nontest_labels)
            # with a single estimate the "RMSE" is just the absolute error
            nontest_rmse = np.sqrt((nontest_estimate - test_estimate)**2)
            nontest_contains_test = test_estimate > nontest_estimate - 2 * nontest_std and test_estimate < nontest_estimate + 2 * nontest_std
            output_df.loc['nontest'] = [
                n_nontest, nontest_estimate, nontest_rmse,
                nontest_estimate - 2 * nontest_std,
                nontest_estimate + 2 * nontest_std, nontest_contains_test
            ]

            # train a model
            print("Doing training")
            model, dev_f1, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(
                project_dir,
                model_type,
                model_name,
                subset,
                labels_df,
                feature_defs,
                weights_df=weights_df,
                items_to_use=train_items,
                penalty=penalty,
                intercept=intercept,
                objective=objective,
                n_dev_folds=n_dev_folds,
                verbose=verbose)

            # predict on the calibration and test sets
            print("Doing prediction on calibration items")
            calib_predictions, calib_pred_probs = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=calib_items,
                verbose=verbose)

            print("Doing prediction on test items")
            test_predictions, test_pred_probs = predict.predict(
                project_dir,
                model,
                model_name,
                subset,
                label,
                items_to_use=test_items,
                verbose=verbose)

            # evaluate the model on the calibration and test data
            print("Doing evaluation")
            f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(
                calib_labels,
                calib_predictions,
                pos_label=pos_label,
                average=average)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels,
                test_predictions,
                pos_label=pos_label,
                average=average)
            results_df = pd.DataFrame([], columns=['f1', 'acc'])
            results_df.loc['calibration'] = [f1_cal, acc_cal]
            results_df.loc['test'] = [f1_test, acc_test]
            results_df.to_csv(
                os.path.join(dirs.dir_models(project_dir), model_name,
                             'results.csv'))

            # first check results without any correction
            # average the predictions (assuming binary labels)
            cc_estimate = np.mean(test_predictions[label].values)
            cc_rmse = np.sqrt((cc_estimate - test_estimate)**2)

            # average the predicted probabilities for the positive label (assuming binary labels)
            pcc_estimate = np.mean(test_pred_probs[1].values)
            pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2)

            # no interval is computed for these estimators, so the bounds
            # are the trivial [0, 1] and 'contains_test' is left as NaN
            output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, 0, 1, np.nan]
            output_df.loc['PCC'] = [
                n_test, pcc_estimate, pcc_rmse, 0, 1, np.nan
            ]

            # do the two basic corrections, based on the calibration data
            print("ACC internal")
            calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(
                calib_labels.values, calib_predictions.values)
            acc = calibration.compute_acc(calib_labels_expanded,
                                          calib_predictions_expanded,
                                          n_classes, calib_weights_expanded)
            acc_corrected = calibration.apply_acc_binary(
                test_predictions.values, acc)
            acc_estimate = acc_corrected[1]
            acc_rmse = np.sqrt((acc_estimate - test_estimate)**2)
            output_df.loc['ACC_int'] = [
                n_calib, acc_estimate, acc_rmse, 0, 1, np.nan
            ]

            print("PVC internal")
            pvc = calibration.compute_pvc(calib_labels_expanded,
                                          calib_predictions_expanded,
                                          n_classes,
                                          weights=calib_weights_expanded)
            pvc_corrected = calibration.apply_pvc(test_predictions.values, pvc)
            pvc_estimate = pvc_corrected[1]
            pvc_rmse = np.sqrt((pvc_estimate - test_estimate)**2)
            output_df.loc['PVC_int'] = [
                n_calib, pvc_estimate, pvc_rmse, 0, 1, np.nan
            ]

            # do IVAP for calibration
            # NOTE(review): `subset` is passed twice on purpose here as in the
            # original call; confirm against ivap.estimate_probs_from_labels
            print("Venn")
            test_pred_ranges = ivap.estimate_probs_from_labels(
                project_dir,
                model,
                model_name,
                subset,
                subset,
                labels_df,
                calib_items,
                test_items,
                weights_df=weights_df)
            # merge the two columns of each row into a single probability:
            # p1 / (1 - p0 + p1)
            combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] +
                                              test_pred_ranges[:, 1])

            # the column means are reported as the lower / upper bounds
            pred_range = np.mean(test_pred_ranges, axis=0)
            venn_estimate = np.mean(combo)
            venn_rmse = np.sqrt((venn_estimate - test_estimate)**2)
            venn_contains_test = pred_range[0] < test_estimate < pred_range[1]
            output_df.loc['Venn'] = [
                n_calib, venn_estimate, venn_rmse, pred_range[0],
                pred_range[1], venn_contains_test
            ]

            output_filename = os.path.join(dirs.dir_models(project_dir),
                                           model_name,
                                           field_name + '_' + str(v) + '.csv')
            output_df.to_csv(output_filename)
Example #15
0
def _write_ranked_counts(counts, filename):
    """Rank a count table and save the ranking as JSON in ``dirs.lda_dir``.

    :param counts: dict mapping an item to the number of times it was seen
    :param filename: name of the JSON file to create inside ``dirs.lda_dir``
    :return: list of ``(count, item)`` pairs, highest count first
    """
    # items are unique dict keys, so no two pairs compare equal and a single
    # descending sort gives the same order as sort-then-reverse
    ranked = sorted(((n, item) for item, n in counts.items()), reverse=True)
    fh.write_to_json(ranked, os.path.join(dirs.lda_dir, filename), sort_keys=False)
    return ranked


def identify_rnn_targets(output_data_filename):
    """Select head-word positions in sentences to use as RNN targets.

    Story elements are extracted from every parsed article, the attributes
    and agent / patient roles attached to them are counted over the whole
    corpus, and the rankings (plus the filtered "most common" vocabularies)
    are written as JSON files to ``dirs.lda_dir``.  An element is kept when
    it has at least one non-pronoun head word and at least ``min_tuples``
    attributes / roles from the filtered vocabularies.  For every kept
    element, each non-pronoun head contributes one
    ``(head_index, sentence, basename)`` entry, with each sentence used at
    most once per article.

    The ``print`` calls take a single argument so that the output is the
    same under Python 2 and Python 3.

    :param output_data_filename: path of the JSON file that receives the
        list of ``(head_index, sentence, basename)`` entries
    :return: None
    """
    min_role_vocab = 4  # minimum corpus count for an attribute / role to be kept
    min_tuples = 3      # minimum number of kept tuples for an element to be valid

    # type ids used to tag the tuples attached to each story element
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    heads = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print("Extracting story elements")
    for f in parsed_files:
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        print(f)
        element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename)
        story_elements[basename] = element_list
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print("Finding most common tokens")
    _write_ranked_counts(heads, 'common_heads.json')
    common_attributes = _write_ranked_counts(attributes, 'common_attributes.json')
    common_agent_roles = _write_ranked_counts(agent_roles, 'common_agent_roles.json')
    common_patient_roles = _write_ranked_counts(patient_roles, 'common_patient_roles.json')

    print(pronoun_list)
    # keep only the frequent entries; pronouns are always dropped, and
    # stopwords are additionally dropped from the two role vocabularies
    most_common_attributes = {k: v for v, k in common_attributes
                              if v >= min_role_vocab and k not in pronoun_list}
    most_common_agent_roles = {k: v for v, k in common_agent_roles
                               if v >= min_role_vocab and k not in pronoun_list and k not in stopwords}
    most_common_patient_roles = {k: v for v, k in common_patient_roles
                                 if v >= min_role_vocab and k not in pronoun_list and k not in stopwords}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print(len(most_common_attributes))
    print(len(most_common_agent_roles))
    print(len(most_common_patient_roles))

    print("Filtering tuples")
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            # need at least one head word that is not a pronoun
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]

                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    output_data = []
    for basename, element_list in valid_elements.items():
        # each sentence of an article is emitted at most once
        used_sentences = set()
        for se in element_list:
            # head_indices, head_words and sentences are indexed in parallel
            for i, head_index in enumerate(se.head_indices):
                sentence = se.sentences[i]
                # sanity check: the head index must point inside the sentence
                assert head_index < len(sentence.split())
                if se.head_words[i] not in pronoun_list and sentence not in used_sentences:
                    output_data.append((head_index, sentence, basename))
                    used_sentences.add(sentence)

    with codecs.open(output_data_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, indent=2, sort_keys=False)

    """
Example #16
0
def cluster_entities(entities_file):
    # read the entities grouped by coref
    groups = fh.read_json(entities_file)

    print len(groups)

    # remove pronouns
    pronoun_list = ['he', 'his', 'it', 'they', 'their', 'He', 'It', 'I', 'them', 'him',
                    'its', 'her', 'she', 'They', 'That', 'His', 'we', 'We', 'that', 'she',
                    'my', 'me', 'our', 'himself', 'This', 'themselves', 'Her', 'Their',
                    'us', 'My', 'you', 'itself', 'this', 'Its', 'Our', 'herself', 'myself',
                    'You', 'These', 'those', 'your', "'s'", 'She', 'i']
    pronoun_list.sort()

    for p in pronoun_list:
        if p in groups:
            groups.pop(p)
        for subgroup in groups.values():
            if p in subgroup:
                subgroup.pop(p)


    # count the number of corefs for each group
    group_sizes = {k: np.sum(v.values()) for k, v in groups.items()}

    #keys = refs.keys()
    #vals = refs.values()

    # sort by the number of corefs for each type
    #order = np.argsort(vals).tolist()
    #order.reverse()

    # assign each entity to the group that it corefs with the most times
    entities = {}
    for group, corefs in groups.items():
        for name, count in corefs.items():
            if name not in entities:
                entities[name] = (group, count)
            elif count > entities[name][1]:
                entities[name] = (group, count)


    #names = entities.keys()
    #counts = [c for r, c in entities.values()]
    #order = np.argsort(counts).tolist()
    #order.reverse()

    #for i in range(40):
    #    name = names[order[i]]
    #    print name, entities[name]

    # for each entity in a document, map it to the group that it corefs with the most
    # choose the group that has the largest total counts
    # replace all references with that group name

    sample_document = os.path.join(dirs.data_stanford_dir, 'xml', 'Immigration1.0-24.txt.xml')

    sentences, _, _, _, _, doc_groups= stanford.parse_xml_output(sample_document)

    replacements = []
    for doc_group in doc_groups:
        counts = {}
        for ref, sentence, start, end in doc_group:
            ref = ref.lower()
            if ref in entities:
                group, count = entities[ref]
                if group in counts:
                    counts[group] += count
                else:
                    counts[group] = count
        group_list = counts.keys()
        count_list = counts.values()
        order = np.argsort(count_list).tolist()
        order.reverse()
        replacements.append(group_list[order[0]])
        print doc_group, counts