Example #1
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):

    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    td_col_sizes, vd_col_sizes = [], []
    for (essays_TD, essays_VD) in folds:
        td_x_shape, vd_x_shape = train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_x_shape[1])
        vd_col_sizes.append(vd_x_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
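
This first variant stops after vectorization and only reports the mean feature-matrix width per fold, which is useful for sizing a candidate extractor before a full training run. A minimal sketch of how it might be driven; tagged_essays is the module-level global the function reads, while base_config, base_extractors, and candidate_extractors are assumed names, not from the source:

# Sketch only: report mean train/validation feature counts per candidate extractor.
# base_config, base_extractors and candidate_extractors are assumptions.
for candidate in candidate_extractors:
    td_cols, vd_cols = evaluate_feature_set(base_config, base_extractors, candidate,
                                            features_filename_prefix="feats")
    print("%s: mean TD columns %.1f, mean VD columns %.1f"
          % (candidate.__name__, td_cols, vd_cols))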
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):

    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)
        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    parameters["extractors"] = map(lambda fn: fn.func_name, feat_extractors)
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
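
Since this variant returns a micro-averaged F1 for the extended extractor set, the natural driver is a greedy forward-selection loop. A sketch under assumed names; candidate_extractors, the config dict, and the filename prefix are not from the source:

# Sketch only: greedily grow the extractor set while the validation micro F1 improves.
best_extractors, best_f1 = [], -1.0
remaining = list(candidate_extractors)  # assumed list of candidate functions
while remaining:
    scored = [(evaluate_feature_set(config, best_extractors, ext, "features/fs"), ext)
              for ext in remaining]
    f1, ext = max(scored, key=lambda pair: pair[0])
    if f1 <= best_f1:
        break  # no remaining extractor improves on the current set
    best_f1 = f1
    best_extractors.append(ext)
    remaining.remove(ext)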
# For mongo
extractor_names = list(map(lambda fn: fn.__name__, extractors))
print("Extractors\n\t" + "\n\t".join(extractor_names))

feat_config = dict(list(train_config.items()) + [("extractors", extractors)])
""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)

test_config = dict(train_config.items())
test_config["folder"] = test_folder

test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# most params below exist ONLY for the purposes of the hashing to and from disk
train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats  = extract_features(test_tagged_essays,  **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))

tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags  = regular_tags
# tags to evaluate against
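
tag_freq is built above but only its keys are kept, so the Counter is effectively unused; if rare codes ever needed pruning, a threshold would be a one-line change. A sketch where MIN_TAG_FREQ is a hypothetical cutoff, not from the source:

MIN_TAG_FREQ = 5  # hypothetical cutoff
regular_tags = [t for t, freq in tag_freq.items() if freq >= MIN_TAG_FREQ]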
Example #5
def main():
    args = parse_args()

    data = load_data('data/adult.data')
    test_data = load_data('data/adult.test2')
    val_data = load_data('data/adult.val')

    if args.depth_plot:
        print('Calculating f1-scores for different depths...')
        depths, scores = dt.tune_max_depth(data, val_data)
        plt.plot(depths, scores)
        plt.ylabel('F1-score')
        plt.xlabel('Maximum Depth')
        plt.show()
        quit()

    baseline_tree = dt.build_decision_tree(
        data, max_depth=1, forced_attribute=args.baseline_attribute)
    print('Building decision tree...')
    dt_start = time.time()
    if args.depth is not None:
        tree = dt.build_decision_tree(data, max_depth=args.depth)
    else:
        tree = dt.build_decision_tree(data)

    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])

    if args.rep:
        print('Pruning decision tree (reduced error)...')
        dtre_start = time.time()
        dt.reduced_error_prune(tree, val_data)
        print('Decision tree pruned (reduced error) in ' +
              str(time.time() - dtre_start) + ' s.')
        dtre_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [tree])
    elif args.csp:
        print('Pruning decision tree (chi-square)...')
        dtcs_start = time.time()
        dt.chi_square_prune(tree)
        print('Decision tree pruned (chi-square) in ' +
              str(time.time() - dtcs_start) + ' s.')
        dtcs_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [tree])

    y_train = get_labels(data)
    y_test = get_labels(test_data)

    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]
    feature_names = features[2]
    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = LogisticRegression(solver='sag').fit(X_train, y_train)

    print('Logistic regression model built in ' + str(time.time() - lr_start) +
          ' s.')

    if args.lr_top is not None:
        print('Top weighted features in logistic regression model: ' +
              str(get_lr_top_weights(lr_model, args.lr_top, feature_names)[0]))
    if args.lr_bot is not None:
        print(
            'Top negatively weighted features in logistic regression model: ' +
            str(get_lr_top_weights(lr_model, args.lr_bot, feature_names)[1]))

    lr_pred = lr_model.predict(X_test)

    weights = perceptron.perceptron(X_train, y_train, 10)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    perceptron_metrics = (
        [y_test[i] == perceptron_pred[i] for i in range(len(y_test))].count(True) / len(test_data),
        precision_score(y_test, perceptron_pred),
        recall_score(y_test, perceptron_pred),
        f1_score(y_test, perceptron_pred))
    lr_metrics = (
        [y_test[i] == lr_pred[i] for i in range(len(y_test))].count(True) / len(test_data),
        precision_score(y_test, lr_pred),
        recall_score(y_test, lr_pred),
        f1_score(y_test, lr_pred))

    print('Baseline:')
    print('Accuracy: ' + str(baseline_metrics[0]))
    print('Precision: ' + str(baseline_metrics[1]))
    print('Recall: ' + str(baseline_metrics[2]))
    print('F1 Score: ' + str(baseline_metrics[3]))

    print('\nDecision Tree:')
    print('Accuracy: ' + str(dt_metrics[0]))
    print('Precision: ' + str(dt_metrics[1]))
    print('Recall: ' + str(dt_metrics[2]))
    print('F1 Score: ' + str(dt_metrics[3]))

    if args.rep:
        print('\nDecision Tree (w/ reduced error pruning):')
        print('Accuracy: ' + str(dtre_metrics[0]))
        print('Precision: ' + str(dtre_metrics[1]))
        print('Recall: ' + str(dtre_metrics[2]))
        print('F1 Score: ' + str(dtre_metrics[3]))
    elif args.csp:
        print('\nDecision Tree (w/ chi-square pruning):')
        print('Accuracy: ' + str(dtcs_metrics[0]))
        print('Precision: ' + str(dtcs_metrics[1]))
        print('Recall: ' + str(dtcs_metrics[2]))
        print('F1 Score: ' + str(dtcs_metrics[3]))

    print('\nPerceptron:')
    print('Accuracy: ' + str(perceptron_metrics[0]))
    print('Precision: ' + str(perceptron_metrics[1]))
    print('Recall: ' + str(perceptron_metrics[2]))
    print('F1 Score: ' + str(perceptron_metrics[3]))

    print('\nLogistic Regression:')
    print('Accuracy: ' + str(lr_metrics[0]))
    print('Precision: ' + str(lr_metrics[1]))
    print('Recall: ' + str(lr_metrics[2]))
    print('F1 Score: ' + str(lr_metrics[3]))

    if args.plot:
        metrics_baseline = (baseline_metrics[0], baseline_metrics[1],
                            baseline_metrics[2], baseline_metrics[3])
        metrics_dt = (dt_metrics[0], dt_metrics[1], dt_metrics[2],
                      dt_metrics[3])
        metrics_perceptron = (perceptron_metrics[0], perceptron_metrics[1],
                              perceptron_metrics[2], perceptron_metrics[3])
        metrics_lr = (lr_metrics[0], lr_metrics[1], lr_metrics[2],
                      lr_metrics[3])
        metrics_dtre, metrics_dtcs = None, None
        if args.rep:
            metrics_dtre = (dtre_metrics[0], dtre_metrics[1], dtre_metrics[2],
                            dtre_metrics[3])
        elif args.csp:
            metrics_dtcs = (dtcs_metrics[0], dtcs_metrics[1], dtcs_metrics[2],
                            dtcs_metrics[3])
        plot_metrics(metrics_baseline, metrics_dt, metrics_perceptron,
                     metrics_lr, metrics_dtre, metrics_dtcs)
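
compute_metrics itself is not shown; from its call sites it takes a classify function, the test rows, and a list of extra positional arguments, and returns an (accuracy, precision, recall, f1) tuple. A minimal sketch consistent with that contract, reusing get_labels and the sklearn metric functions already used above; the classify_fn argument order is an assumption:

def compute_metrics(classify_fn, test_data, extra_args):
    # Sketch only: classify_fn(row, *extra_args) is an assumed calling convention.
    y_true = get_labels(test_data)
    y_pred = [classify_fn(row, *extra_args) for row in test_data]
    accuracy = sum(1 for t, p in zip(y_true, y_pred) if t == p) / len(test_data)
    return (accuracy, precision_score(y_true, y_pred),
            recall_score(y_true, y_pred), f1_score(y_true, y_pred))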
def main():
    data = load_data('data/adult.data')
    baseline_tree = dt.build_decision_tree(data, max_depth=1)
    print('Building decision tree...')
    dt_start = time.time()
    tree = dt.build_decision_tree(data)
    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    test_data = load_data('data/adult.val')
    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data, [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])
    
    y_train = get_labels(data)
    y_test = get_labels(test_data)

    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]

    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = build_lr_model(X_train, y_train)
    print('Logistic regression model built in ' + str(time.time() - lr_start) + ' s.')

    lr_pred = lr_model.predict(X_test)

    # perceptron
    weights = perceptron.perceptron(X_train, y_train, 6)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    # scikit-learn's Perceptron model
    perceptron_ski = build_perceptron_ski(X_train, y_train)
    y_percep_pred = perceptron_ski.predict(X_test)
    '''
    Result:
    Accuracy: 0.8032061912658928
    Precision: 0.5655369538587178
    Recall: 0.7202288091523661
    F1 Score: 0.6335773101555352
    '''

    # Gaussian Naive Bayes
    naive_bayes_model = build_naive_bayes(X_train, y_train)
    y_naive_bayes_pred = naive_bayes_model.predict(X_test)

    '''
    Result:
    Accuracy: 0.48473680977826916
    Precision: 0.3092619027626165
    Recall: 0.9576183047321893
    F1 Score: 0.4675341161536021
    '''


    print('Baseline:')
    print('Accuracy: ' + str(baseline_metrics[0]))
    print('Precision: ' + str(baseline_metrics[1]))
    print('Recall: ' + str(baseline_metrics[2]))
    print('F1 Score: ' + str(baseline_metrics[3]))
    
    print('\nDecision Tree:')
    print('Accuracy: ' + str(dt_metrics[0]))
    print('Precision: ' + str(dt_metrics[1]))
    print('Recall: ' + str(dt_metrics[2]))
    print('F1 Score: ' + str(dt_metrics[3]))

    print('\nLogistic Regression:')
    print('Accuracy: ' + str([y_test[i] == lr_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, lr_pred)))
    print('Recall: ' + str(recall_score(y_test, lr_pred)))
    print('F1 Score: ' + str(f1_score(y_test, lr_pred)))

    print('\nPerceptron:')
    print('Accuracy: ' + str([y_test[i] == perceptron_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, perceptron_pred)))
    print('Recall: ' + str(recall_score(y_test, perceptron_pred)))
    print('F1 Score: ' + str(f1_score(y_test, perceptron_pred)))

    print('\nPerceptron (sklearn):')
    print('Accuracy: ' + str([y_test[i] == y_percep_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, y_percep_pred)))
    print('Recall: ' + str(recall_score(y_test, y_percep_pred)))
    print('F1 Score: ' + str(f1_score(y_test, y_percep_pred)))

    print('\nNaive Bayes (sklearn):')
    print('Accuracy: ' + str([y_test[i] == y_naive_bayes_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, y_naive_bayes_pred)))
    print('Recall: ' + str(recall_score(y_test, y_naive_bayes_pred)))
    print('F1 Score: ' + str(f1_score(y_test, y_naive_bayes_pred)))

    print("\nCross Validation")
def evaluate_feature_set(config, existing_extractors):

    feat_extractors = existing_extractors
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])
    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)
    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor"))
    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags
    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)
        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)
        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #        delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #            for (essays_TD, essays_VD) in folds)

    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    # SUFFIX = "_FEAT_SELECTION"
    # CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    # parameters = dict(config)
    # parameters["extractors"] = list(map(lambda fn: fn.func_name, feat_extractors))
    # parameters["min_feat_freq"] = MIN_FEAT_FREQ
    #
    # wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
    #                                            cv_wd_td_predictions_by_tag, parameters, wd_algo)
    # wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
    #                                            cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    # avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return 0
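
merge_dictionaries is imported from elsewhere in the repo; its use above implies it folds one fold's per-tag results into the cross-validation accumulators, which are defaultdicts of lists. A minimal sketch consistent with that usage, not the original implementation:

def merge_dictionaries(source, target):
    # Sketch only: append each tag's fold-level values onto the accumulator lists.
    for tag, values in source.items():
        target[tag].extend(values)
    return target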
extractor_names = list(map(lambda fn: fn.__name__, extractors))
print("Extractors\n\t" + "\n\t".join(extractor_names))

feat_config = dict(list(train_config.items()) + [("extractors", extractors)])
""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)

test_config = dict(train_config.items())
test_config["folder"] = test_folder

test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" %
            (len(train_tagged_essays), len(test_tagged_essays)))

# most params below exist ONLY for the purposes of the hashing to and from disk
train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats = extract_features(test_tagged_essays, **feat_config)
logger.info("Features loaded")
""" DEFINE TAGS """
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))

tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())
""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags
# tags to evaluate against

folds = [(train_essay_feats, test_essay_feats)]
""" CLASSIFIERS """
Example #9
        for must_not_have_noun_phrase in [
                True
        ]:  # Don't replace if there is one or more real noun phrases in the reference
            updated_essays = get_processed_essays(
                tagged_essays,
                coref_files,
                max_mention_len=max_mention_len,
                max_reference_len=max_reference_len,
                must_not_have_noun_phrase=must_not_have_noun_phrase)
            """ LOAD DATA """
            assert len(updated_essays) == len(tagged_essays), "Must be same number of essays after processing"
            print(len(updated_essays), "updated essays")

            # most params below exist ONLY for the purposes of the hashing to and from disk
            train_essay_feats = extract_features(updated_essays, **feat_config)
            logger.info("Features loaded")
            """ DEFINE TAGS """
            _, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
            all_regular_tags = list(t for t in flatten(lst_all_tags) if t[0].isdigit())

            tag_freq = Counter(all_regular_tags)
            regular_tags = list(tag_freq.keys())
            """ works best with all the pair-wise causal relation codes """
            wd_train_tags = regular_tags
            wd_test_tags = regular_tags
            # tags to evaluate against

            folds = cross_validation(train_essay_feats, CV_FOLDS)
            """ CLASSIFIERS """
# for max_reference_len in [1, 2, 3, 5, 10, 100]:
#     for max_mention_len in [1, 2, 3, 5, 10, 100]:
for max_reference_len in [0]:
    for max_mention_len in [0]:
        # for must_not_have_noun_phrase in [True, False]: # Don't replace if there is one or more real noun phrases in the reference
        for must_not_have_noun_phrase in [True]: # Don't replace if there is one or more real noun phrases in the reference
            updated_essays = replace_corefs_with_mentions(tagged_essays, coref_files,
                                                          max_mention_len=max_mention_len, max_reference_len=max_reference_len,
                                                          must_not_have_noun_phrase=must_not_have_noun_phrase)
            """ LOAD DATA """
            assert len(updated_essays) == len(tagged_essays), "Must be same number of essays after processing"
            print(len(updated_essays), "updated essays")

            # most params below exist ONLY for the purposes of the hashing to and from disk
            train_essay_feats = extract_features(updated_essays, **feat_config)
            logger.info("Features loaded")

            """ DEFINE TAGS """
            _, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
            all_regular_tags = list((t for t in flatten(lst_all_tags) if t[0].isdigit()))

            tag_freq = Counter(all_regular_tags)
            regular_tags = list(tag_freq.keys())

            """ works best with all the pair-wise causal relation codes """
            wd_train_tags = regular_tags
            wd_test_tags  = regular_tags
            # tags to evaluate against

            folds = cross_validation(train_essay_feats, CV_FOLDS)
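
cross_validation is the one repo helper every example above depends on; its contract, a list of (train, validation) essay pairs, is clear from the loops that consume folds. A minimal round-robin sketch consistent with that contract, not necessarily the original implementation:

def cross_validation(items, num_folds):
    # Sketch only: deal items round-robin into folds; each fold serves as the
    # validation set once, with the remaining items as training data.
    folds = []
    for k in range(num_folds):
        vd = [x for i, x in enumerate(items) if i % num_folds == k]
        td = [x for i, x in enumerate(items) if i % num_folds != k]
        folds.append((td, vd))
    return folds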