Example 1
def train_full_model(cfm_dev, cfm_test, best_alphas, exp_dir=None, name=None, label_file=None, target=None, feature_list=None,
                     model_type=None, n_eval_iters=None, eval_prop=None,
                     min_alpha_exp=-4, max_alpha_exp=5, alpha_exp_base=np.sqrt(10),
                     reuse=False, orig_T=0.04, tau=0.01, verbose=1, best_alpha=None,
                     weight_col=-1, additional_label_files=None,
                     additional_label_weights=None, predict_all=False, metric='f1', only_unanimous=True,
                     **kwargs):
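    """Tune the regularization strength (alpha) by cross-validation, estimate
    hold-out performance on a set of random splits, then train a final model on
    all documents and write predictions, probabilities, per-item errors, summary
    results, and the fitted model to exp_dir.
    """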

    # load the labels
    all_items, target_name, labels, weights, unanimous_pairs = lr.get_labels(label_file, target, weight_col=weight_col)
    n_labels = np.max(labels) + 1

    n_dev_folds = ds.get_n_dev_folds()

    if only_unanimous:
        unanimous_subset = set(unanimous_pairs.keys())
        print len(unanimous_subset)
    else:
        unanimous_subset = None

    # experiment directory for this run (the parameters are saved below via write_log)
    print exp_dir

    # load the features
    print "Loading features"
    _, feature_names, X = exp2.load_features(feature_list, all_items, verbose=verbose)

    # deal with loading items for pseudo-documents (i.e. partial documents, dependent on annotations)
    if additional_label_files is not None:
        print "Loading features for pseudodocuments"
        extra_items = []
        extra_labels = []
        extra_weights = []
        extra_Xs = []
        full_doc_index = {}
        for fi, f in enumerate(additional_label_files):
            basename = fh.get_basename_wo_ext(f)
            # assume the last part of the label file is a unique type
            parts = basename.split('_')
            pseudotype = parts[-1]
            # get the factor by which to multiply the weights for these items
            weight_factor = additional_label_weights[fi]
            # get the index which maps pseudodocuments to real documents
            doc_index = fh.read_json(fh.make_filename(dirs.data_raw_index_dir, basename + '_index', 'json'))
            # update a global index
            full_doc_index.update(doc_index)
            # get the labels for these pseudodocuments
            f_items, f_target_name, f_labels, f_weights, _ = lr.get_labels(f, target, weight_col=weight_col)
            # add to our overall list of items, labels, and weights (for pseudodocuments)
            extra_items.extend(f_items)
            extra_labels.append(f_labels)
            extra_weights.append(weight_factor * f_weights)
            # create updated feature types for these pseudodocuments
            pseudo_feature_list = [f + ',pseudotype=' + pseudotype for f in feature_list]
            # load these features
            _, _, extra_X = exp2.load_features(pseudo_feature_list, f_items, doc_index=doc_index, verbose=verbose)
            # put them in a list to make one big matrix of pseudo-documents
            extra_Xs.append(extra_X)
        # concatenate the numeric values from each pseudo-document
        extra_labels = np.concatenate(extra_labels)
        extra_weights = np.concatenate(extra_weights)
        extra_X = sparse.vstack(extra_Xs)
    else:
        extra_items = None
        extra_labels = None
        extra_weights = None
        extra_X = None
        full_doc_index = None


    n_test_folds = ds.get_n_test_folds()
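    # candidate alphas: a 10-point linear grid spanning the previously selected best alphas;
    # each candidate is scored by its mean CV performance summed across test folds, and the argmax is kept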
    models = {}
    alphas = np.linspace(np.min(best_alphas), np.max(best_alphas), 10).tolist()
    f1_sums = np.zeros(len(alphas))
    for test_fold in range(n_test_folds):
        print "Tuning hyperparameters"

        model = SparseModel(model_type=model_type, column_names=feature_names, metric=metric, **kwargs)

        if additional_label_files is not None:
            valid_f1s, best_alpha = model.tune_by_cv(X, labels, all_items, alphas, test_fold, n_dev_folds,
                                                     sample_weights=weights, extra_X=extra_X, extra_y=extra_labels,
                                                     extra_items=extra_items, extra_weights=extra_weights,
                                                     doc_index=full_doc_index,
                                                     dev_test_subset=unanimous_subset, verbose=verbose)
        else:
            valid_f1s, best_alpha = model.tune_by_cv(X, labels, all_items, alphas, test_fold, n_dev_folds,
                                                     sample_weights=weights,
                                                     dev_test_subset=unanimous_subset, verbose=verbose)

        f1_sums += np.mean(valid_f1s.as_matrix(), axis=0)
        print target, '; y_sum = ' + str(labels.sum()) + '; best alpha =', best_alpha, \
            "; valid f1 = ", valid_f1s.mean(axis=0)[str(best_alpha)]

    for a_i, a in enumerate(alphas):
        print a, f1_sums[a_i] / n_test_folds

    best_index = np.argmax(f1_sums)
    best_alpha = alphas[best_index]

    params_list = [name, exp_dir, label_file, target, target_name, feature_list, model_type,
                   min_alpha_exp, max_alpha_exp, alpha_exp_base, weight_col,
                   reuse, orig_T, tau, best_alpha, additional_label_files, additional_label_weights,
                   metric, only_unanimous] + kwargs.values()
    params_names = """name, exp_dir, label_file, target, target_name, feature_list, model_type,
                    min_alpha_exp, max_alpha_exp, alpha_exp_base, weight_col,
                    reuse, orig_T, tau, best_alpha, additional_label_files, additional_label_weights,
                    metric, only_unanimous"""

    for key in kwargs.keys():
        params_names += ', ' + key
    write_log(exp_dir, params_names, params_list)



    model = SparseModel(model_type=model_type, column_names=feature_names, metric=metric, alpha=float(best_alpha),
                        **kwargs)

    # use a set number of random splits to evaluate dev performance
    print "Estimating hold-out performance"
    valid_f1s = []
    valid_f1s_unan = []
    masked_f1s = []
    validation_loss_vector = []
    validation_items_list = []

    confusion_matrix_sum = np.zeros([n_labels, n_labels])
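    # the (sample-weighted) confusion matrix is accumulated over the random splits below;
    # its normalized mean is written out as 'proportions.json' at the end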

    for dev_fold in range(n_eval_iters):

        n_eval = int(round(eval_prop * len(all_items)))
        eval_items = set(np.random.choice(all_items, n_eval, replace=False).tolist())
        train_items = set(all_items) - eval_items

        if only_unanimous:
            unan_eval_items = list(set(eval_items).intersection(unanimous_subset))
            unan_train_items = list(set(train_items).intersection(unanimous_subset))
        else:
            unan_eval_items = eval_items
            unan_train_items = train_items

        eval_indices = exp2.get_indices(all_items, eval_items)
        train_indices = exp2.get_indices(all_items, train_items)

        unan_eval_indices = exp2.get_indices(all_items, unan_eval_items)
        unan_train_indices = exp2.get_indices(all_items, unan_train_items)

        if additional_label_files is not None:
            predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                   sample_weights=weights, extra_X=extra_X, extra_y=extra_labels,
                                                   extra_items=extra_items, extra_weights=extra_weights,
                                                   doc_index=full_doc_index, verbose=1)

        else:
            predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                   sample_weights=weights, verbose=1)

        #validation_loss_vector.extend(get_loss_vector(labels, predictions, weights, eval_indices))
        #validation_items_list.extend([i for index, i in enumerate(all_items) if index in eval_indices])
        train_macro_f1, valid_macro_f1 = exp2.evaluate_predictions(labels, predictions, eval_indices,
                                                                                     train_indices,
                                                                                     weights=weights, metric=metric)

        train_macro_f1_unan, valid_macro_f1_unan = exp2.evaluate_predictions(labels, predictions,
                                                                                                    unan_eval_indices,
                                                                                                    unan_train_indices,
                                                                                                    weights=weights,
                                                                                                    metric=metric)

        confusion_matrix = exp2.compute_confution_matrix(labels, weights, predictions, eval_indices, n_labels)
        confusion_matrix_sum += confusion_matrix

        valid_f1s_unan.append(valid_macro_f1_unan)
        valid_f1s.append(valid_macro_f1)

        print("Fold %d: train %s: %f; train unan %s: %f; valid %s: %f; valid unan %s: %f"
              % (dev_fold, metric, train_macro_f1, metric, train_macro_f1_unan, metric, valid_macro_f1, metric, valid_macro_f1_unan))
        print confusion_matrix

    print "Mean unan dev f1 =", np.mean(valid_f1s_unan)

    #fh.write_to_json(zip(validation_items_list, validation_loss_vector), fh.make_filename(exp_dir, 'loss_vector', 'json'))
    fh.write_to_json(valid_f1s_unan, fh.make_filename(exp_dir, 'dev_f1s_unan', 'json'))
    fh.write_to_json(valid_f1s, fh.make_filename(exp_dir, 'dev_f1s', 'json'))
    fh.write_to_json(masked_f1s, fh.make_filename(exp_dir, 'dev_f1s_masked', 'json'))

    # finally, train one final model
    print "Training final model"
    #train_dict, valid_dict, test_dict = get_item_dicts(datasets, test_fold, dev_subfold)
    train_items = ds.get_all_documents()

    if only_unanimous:
        unan_train_items = list(set(train_items).intersection(unanimous_subset))
    else:
        unan_train_items = train_items

    train_indices = exp2.get_indices(all_items, train_items)

    unan_train_indices = exp2.get_indices(all_items, unan_train_items)

    if verbose > 0:
        n, p = X.shape
        print ' n_train =', len(train_items), '; n_features =', p

    if additional_label_files is not None:
        predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                               sample_weights=weights, extra_X=extra_X, extra_y=extra_labels,
                                               extra_items=extra_items, extra_weights=extra_weights,
                                               doc_index=full_doc_index, verbose=1)

    else:
        predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                               sample_weights=weights, verbose=1)

    train_f1, test_f1 = exp2.evaluate_predictions(labels, predictions, train_indices,
                                                             train_indices,
                                                             weights=weights, metric='f1')

    train_acc, test_acc = exp2.evaluate_predictions(labels, predictions, train_indices,
                                                                train_indices,
                                                                weights=weights, metric='acc')

    unan_train_f1, unan_test_f1 = exp2.evaluate_predictions(labels, predictions, train_indices,
                                                                            unan_train_indices,
                                                                            weights=weights, metric='f1')

    #test_loss_vector = get_loss_vector(labels, predictions, weights, test_indices)
    #fh.write_to_json(zip(test_items, test_loss_vector), fh.make_filename(exp_dir, 'loss_vector_test', 'json'))

    evaluation.write_prediction_errors(labels, predictions, train_indices, all_items, weights,
                                       fh.make_filename(exp_dir, 'train_errors', 'csv'))
    #evaluation.write_prediction_errors(labels, predictions, test_indices, all_items, weights,
    #                                   fh.make_filename(exp_dir, 'test_errors', 'csv'))

    output = pd.DataFrame(predictions, index=all_items, columns=[target_name])
    output.to_csv(fh.make_filename(exp_dir, 'predictions', 'csv'))

    output = pd.DataFrame(probs, index=all_items, columns=[target_name])
    output.to_csv(fh.make_filename(exp_dir, 'probs', 'csv'))

    fh.write_to_json({'train_f1': train_f1, 'train_acc': train_acc, 'train_f1_subset': unan_train_f1,
                      'mean_dev_f1_subset': float(np.mean(valid_f1s_unan))},
                     fh.make_filename(exp_dir, 'results', 'json'))

    model.write_to_file(fh.make_filename(exp_dir, 'model', 'json'))

    if predict_all:
        data = fh.read_json(dirs.data_processed_text_file)
        all_items = data.keys()
        all_items.sort()
        _, feature_names, X = exp2.load_features(feature_list, None, all_items, stage='test', verbose=verbose)
        predictions = model.predict(X)
        df = pd.DataFrame(predictions, index=all_items, columns=[target_name])
        df.to_csv(fh.make_filename(exp_dir, 'full_predictions', 'csv'))

        if model_type == 'LR':
            probs = model.predict_max_probs(X)
            df = pd.DataFrame(probs, index=all_items, columns=[target_name])
            df.to_csv(fh.make_filename(exp_dir, 'full_predictions_probs', 'csv'))

    if only_unanimous:
        loss = float(-np.mean(valid_f1s_unan))
    else:
        loss = float(-np.mean(valid_f1s))

    mean_confusion_matrix = exp2.normalize_confusion_matrix(confusion_matrix_sum)

    print "dev matrix"
    print cfm_dev
    print "test matrix"
    print cfm_test
    print "new matrix"
    print mean_confusion_matrix

    proportions_eval = {'confusion_matrix': mean_confusion_matrix.tolist()}
    fh.write_to_json(proportions_eval, fh.make_filename(exp_dir, 'proportions', 'json'))

    print("Overall: train f1: %f; mean valid f1 unan: %f; f1 unan: %f;" % (unan_train_f1, -loss, unan_test_f1))
Example 2
def main():
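    """Reload a SparseModel trained in an earlier experiment (old_exp_dir) and
    apply it to features extracted from a newly processed dataset, writing
    per-item predictions and positive-class probabilities to output_dir.
    """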

    usage = "%prog old_project old_exp_dir new_processed_dir old_splits_file output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('-n', dest='new_name', default=None,
                      help='New name for experiment: default= old name + _rerun')
    parser.add_option('-a', dest='alternate_label_file', default=None,
                      help='Alternate label file: default=%default')
    parser.add_option('--sentences', action="store_true", dest="sentences", default=False,
                      help='Predict at the level of sentences (instead of documents): default=%default')

    (options, args) = parser.parse_args()
    #args = ['mfc_labeled', '/Users/dcard/Projects/CMU/ARK/guac/datasets/mfc_labeled/experiments/test_fold_0/primary/primary_test/', '/Users/dcard/Projects/CMU/ARK/guac/datasets/AILA_CIS/data/processed/', '/Users/dcard/Projects/CMU/ARK/guac/datasets/AILA_CIS/experiments/test_fold_0/blog/primary_test']
    print args
    project = args[0]
    old_exp_dir = args[1]
    new_processed_dir = args[2]
    new_splits_file = args[3]
    output_dir = args[4]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dirs.set_project(project, new_splits_file)

    log_filename = os.path.join(old_exp_dir, 'log.json')
    model_file = os.path.join(old_exp_dir, 'model.json')

    test_fold = int(options.test_fold)
    new_name = options.new_name
    use_sentences = options.sentences
    alternate_label_file = options.alternate_label_file

    log = fh.read_json(log_filename)
    if new_name is None:
        new_name = log['name'] + '_rerun'

    log['name'] = new_name

    float_vars = ['best_alpha', 'alpha_exp_base', 'max_alpha_exp', 'min_alpha_exp', 'orig_T', 'tau']
    for v in float_vars:
        if v in log and log[v] is not None:
            log[v] = float(log[v])

    target_col = int(log['target'])
    weight_col = int(log['weight_col'])
    label_file = log['label_file']


    # load the features
    print "Loading features"
    feature_list = log['feature_list']

    sentences = fh.read_json(os.path.join(new_processed_dir, '..', 'raw', 'text', 'sentences.json'))
    items = sentences.keys()
    print len(items), items[0]

    _, feature_names, X = experiment2.load_features(feature_list, items, stage='predict', data_processed_dir=new_processed_dir)

    """
    else:
        basename = os.path.split(fh.get_basename_wo_ext(label_file))[-1]
        index_filename = os.path.join(dirs.data_raw_index_dir, basename + '_sentence_index.json')

        # get the index which maps pseudodocuments to real documents
        doc_index = fh.read_json(index_filename)
        items, target_name, labels, weights, _ = lr.get_labels(alternate_label_file, target_col, weight_col=weight_col)
        # add to our overall list of items, labels, and weights (for pseudodocuments)

        # load these features
        _, feature_names, X = experiment.load_features(feature_list, test_fold, items, doc_index=doc_index,
                                                       stage='testing', data_processed_dir=dirs.data_processed_dir)
    """

    model = SparseModel()
    model.load(model_file)

    predictions = model.predict(X)
    print predictions.shape

    probs = model.predict_probs(X)
    print probs.shape

    df1 = pd.DataFrame(predictions, index=items, columns=['prediction'])
    df2 = pd.DataFrame(probs[:, 1], index=items, columns=['prob'])
    df = pd.concat([df1, df2], axis=1)

    output_file = os.path.join(output_dir, label_file + '.csv')
    df.to_csv(output_file)


    # need to refactor the following to make use of test data for the new data, but that's not even necessarily coherent
    """