def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Measure how wide the word-level feature matrices become when
    *new_extractor* is appended to *existing_extractors*.

    For each cross-validation fold, the train/validation essays are flattened
    to word-level features, vectorized, and the number of resulting feature
    columns is recorded.

    Args:
        config: base feature-extraction configuration (mapping).
        existing_extractors: list of extractor callables already selected.
        new_extractor: candidate extractor callable to evaluate.
        features_filename_prefix: only used by the (currently disabled)
            disk-memoization path; kept for interface compatibility.

    Returns:
        (mean_train_columns, mean_validation_columns) averaged over CV_FOLDS.

    NOTE(review): relies on module-level globals (`tagged_essays`, `CV_FOLDS`,
    `MIN_FEAT_FREQ`, `SPARSE_WD_FEATS`) — confirm they are defined before this
    is called. Dead code from the tagging variant of this function (classifier
    factory, tag lists) has been removed; it had no effect on the result.
    """
    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # Disk memoization intentionally disabled; features_filename_prefix exists
    # only for the memoizer's cache key.
    # mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    # essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    def train_tagger(essays_TD, essays_VD):
        # TD and VD are lists of Essay objects; the sentences are lists
        # of featureextractortransformer.Word objects.
        """ Data Partitioning and Training """
        td_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, _ = flatten_to_wordlevel_feat_tags(essays_VD)

        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        # Fit on training words only; validation is transformed with the same vocabulary.
        td_X = feature_transformer.fit_transform(td_feats)
        vd_X = feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    # Gather matrix widths per fold.
    folds = cross_validation(essay_feats, CV_FOLDS)
    td_col_sizes, vd_col_sizes = [], []
    for essays_TD, essays_VD in folds:
        td_shape, vd_shape = train_tagger(essays_TD, essays_VD)
        td_col_sizes.append(td_shape[1])
        vd_col_sizes.append(vd_shape[1])

    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Average train/validation feature-matrix column counts across CV folds
    for the extractor set *existing_extractors* + [*new_extractor*].

    NOTE(review): this redefines `evaluate_feature_set` and shadows any
    earlier definition in the module — the file looks like a concatenation of
    several experiment scripts; confirm which definition is intended.
    """
    extractors = existing_extractors + [new_extractor]
    cfg = dict(list(config.items()) + [("extractors", extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    # mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    # essay_feats = mem_extract_features(tagged_essays, **cfg)
    essay_feats = extract_features(tagged_essays, **cfg)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects; the sentences are lists
        # of featureextractortransformer.Word objects.
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

        transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = transformer.fit_transform(td_feats)
        vd_X = transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    # results = Parallel(n_jobs=CV_FOLDS)(
    #     delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #     for (essays_TD, essays_VD) in folds)

    td_col_sizes, vd_col_sizes = [], []
    for essays_TD, essays_VD in folds:
        td_shape, vd_shape = train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
        td_col_sizes.append(td_shape[1])
        vd_col_sizes.append(vd_shape[1])

    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Cross-validate a word-level tagger with *new_extractor* added.

    Trains one LogisticRegression per tag code on each CV fold, merges the
    per-fold gold labels and predictions, persists both train and validation
    results to MongoDB via `processor`, and returns the validation
    micro-averaged F1 score.

    Args:
        config: base feature-extraction configuration (mapping).
        existing_extractors: list of extractor callables already selected.
        new_extractor: candidate extractor callable to evaluate.
        features_filename_prefix: only used by the (disabled) disk memoizer.

    Returns:
        float: validation micro F1 for the candidate feature set.
    """
    feat_extractors = existing_extractors + [new_extractor]
    # BUG FIX: on Python 3, dict_items + list raises TypeError — materialize
    # the items into a list first.
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    # mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    # essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects; the sentences are lists
        # of featureextractortransformer.Word objects.
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = feature_transformer.fit_transform(td_feats)
        vd_X = feature_transformer.transform(vd_feats)

        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        """ TRAIN Tagger """
        # Consistency: reuse the classifier factory defined above instead of a
        # second identical inline lambda.
        tag2word_classifier = train_classifier_per_code(
            td_X, wd_td_ys_bytag, fn_create_wd_cls, wd_train_tags, verbose=False)

        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    # results = Parallel(n_jobs=CV_FOLDS)(
    #     delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #     for (essays_TD, essays_VD) in folds)
    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    # BUG FIX: `fn.func_name` was removed in Python 3 (use `__name__`), and a
    # bare map object is lazy — materialize to a list so it can be persisted.
    parameters["extractors"] = [fn.__name__ for fn in feat_extractors]
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(
        CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(
        CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
# For mongo
# BUG FIX: `fn.func_name` was removed in Python 3 — use `__name__`; build a
# list so the names survive the join and remain reusable afterwards.
extractor_names = [fn.__name__ for fn in extractors]
print("Extractors\n\t" + "\n\t".join(extractor_names))

# BUG FIX: on Python 3, dict_items + list raises TypeError — materialize first.
feat_config = dict(list(train_config.items()) + [("extractors", extractors)])

""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)

# Copy the train config and point it at the test folder.
test_config = dict(train_config.items())
test_config["folder"] = test_folder
test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# most params below exist ONLY for the purposes of the hashing to and from disk
train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats = extract_features(test_tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
# Regular concept codes start with a digit (e.g. "50"); keep duplicates so
# Counter can report frequencies.
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
all_regular_tags = list(t for t in flatten(lst_all_tags) if t[0].isdigit())
tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags  # tags to evaluate against
def _print_metrics(title, metrics):
    """Print a (accuracy, precision, recall, f1) 4-tuple under *title*."""
    print(title)
    print('Accuracy: ' + str(metrics[0]))
    print('Precision: ' + str(metrics[1]))
    print('Recall: ' + str(metrics[2]))
    print('F1 Score: ' + str(metrics[3]))


def _score(y_true, y_pred, denom):
    """Return (accuracy, precision, recall, f1) for *y_pred* vs *y_true*.

    Accuracy divides by *denom* (the test-set size) to match the original
    report, which used len(test_data) rather than len(y_true).
    """
    correct = [y_true[i] == y_pred[i] for i in range(len(y_true))].count(True)
    return (correct / denom,
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))


def main():
    """Train and compare baseline / decision-tree / perceptron / logistic
    regression classifiers on the adult dataset, with optional pruning and
    plotting controlled by command-line flags."""
    args = parse_args()
    data = load_data('data/adult.data')
    test_data = load_data('data/adult.test2')
    val_data = load_data('data/adult.val')

    if args.depth_plot:
        print('Calculating f1-scores for different depths...')
        depths, scores = dt.tune_max_depth(data, val_data)
        plt.plot(depths, scores)
        plt.ylabel('F1-score')
        plt.xlabel('Maximum Depth')
        plt.show()
        # BUG FIX: quit() is a site/interactive helper and may be absent when
        # run with -S or frozen; returning from main exits cleanly.
        return

    baseline_tree = dt.build_decision_tree(
        data, max_depth=1, forced_attribute=args.baseline_attribute)

    print('Building decision tree...')
    dt_start = time.time()
    if args.depth is not None:
        tree = dt.build_decision_tree(data, max_depth=args.depth)
    else:
        tree = dt.build_decision_tree(data)
    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data, [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])

    # Optional pruning; dtre/dtcs metrics exist only when the matching flag is set.
    dtre_metrics = dtcs_metrics = None
    if args.rep:
        print('Pruning decision tree (reduced error)...')
        dtre_start = time.time()
        dt.reduced_error_prune(tree, val_data)
        print('Decision tree pruned (reduced error) in ' + str(time.time() - dtre_start) + ' s.')
        dtre_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])
    elif args.csp:
        print('Pruning decision tree (chi-square)...')
        dtcs_start = time.time()
        dt.chi_square_prune(tree)
        print('Decision tree pruned (chi-square) in ' + str(time.time() - dtcs_start) + ' s.')
        dtcs_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])

    y_train = get_labels(data)
    y_test = get_labels(test_data)
    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]
    feature_names = features[2]

    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = LogisticRegression(solver='sag').fit(X_train, y_train)
    print('Logistic regression model built in ' + str(time.time() - lr_start) + ' s.')

    if args.lr_top is not None:
        print('Top weighted features in logistic regression model: '
              + str(get_lr_top_weights(lr_model, args.lr_top, feature_names)[0]))
    if args.lr_bot is not None:
        print('Top negatively weighted features in logistic regression model: '
              + str(get_lr_top_weights(lr_model, args.lr_bot, feature_names)[1]))

    lr_pred = lr_model.predict(X_test)

    weights = perceptron.perceptron(X_train, y_train, 10)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    perceptron_metrics = _score(y_test, perceptron_pred, len(test_data))
    lr_metrics = _score(y_test, lr_pred, len(test_data))

    _print_metrics('Baseline:', baseline_metrics)
    _print_metrics('\nDecision Tree:', dt_metrics)
    if args.rep:
        _print_metrics('\nDecision Tree (w/ reduced error pruning):', dtre_metrics)
    elif args.csp:
        _print_metrics('\nDecision Tree (w/ chi-square pruning):', dtcs_metrics)
    _print_metrics('\nPerceptron:', perceptron_metrics)
    _print_metrics('\nLogistic Regression:', lr_metrics)

    if args.plot:
        metrics_baseline = tuple(baseline_metrics[:4])
        metrics_dt = tuple(dt_metrics[:4])
        metrics_perceptron = tuple(perceptron_metrics[:4])
        metrics_lr = tuple(lr_metrics[:4])
        metrics_dtre = tuple(dtre_metrics[:4]) if args.rep else None
        metrics_dtcs = tuple(dtcs_metrics[:4]) if args.csp else None
        plot_metrics(metrics_baseline, metrics_dt, metrics_perceptron,
                     metrics_lr, metrics_dtre, metrics_dtcs)
def _show_scores(title, y_true, y_pred, denom):
    """Print Accuracy/Precision/Recall/F1 for *y_pred* against *y_true*.

    Accuracy divides by *denom* (the test-set size) to match the original
    inline computation, which used len(test_data) as the denominator.
    """
    correct = [y_true[i] == y_pred[i] for i in range(len(y_true))].count(True)
    print(title)
    print('Accuracy: ' + str(correct / denom))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1 Score: ' + str(f1_score(y_true, y_pred)))


def main():
    """Compare baseline, decision tree, logistic regression, two perceptrons
    and Gaussian naive Bayes on the adult dataset.

    NOTE(review): this redefines `main` — the file looks like a concatenation
    of several experiment scripts; confirm which definition is intended.
    """
    data = load_data('data/adult.data')

    baseline_tree = dt.build_decision_tree(data, max_depth=1)

    print('Building decision tree...')
    dt_start = time.time()
    tree = dt.build_decision_tree(data)
    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    # NOTE(review): "test" data is loaded from the validation split
    # (data/adult.val) — confirm this is intentional.
    test_data = load_data('data/adult.val')
    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data, [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])

    y_train = get_labels(data)
    y_test = get_labels(test_data)
    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]

    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = build_lr_model(X_train, y_train)
    print('Logistic regression model built in ' + str(time.time() - lr_start) + ' s.')
    lr_pred = lr_model.predict(X_test)

    # Hand-rolled perceptron (6 epochs).
    weights = perceptron.perceptron(X_train, y_train, 6)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    # sklearn model's perceptron
    perceptron_ski = perceptron_ski = build_perceptron_ski(X_train, y_train)
    y_percep_pred = perceptron_ski.predict(X_test)
    # Previously observed result (sklearn perceptron):
    #   Accuracy: 0.8032061912658928  Precision: 0.5655369538587178
    #   Recall:   0.7202288091523661  F1 Score:  0.6335773101555352

    # Gaussian Naive Bayes
    naive_bayes_model = build_naive_bayes(X_train, y_train)
    y_naive_bayes_pred = naive_bayes_model.predict(X_test)
    # Previously observed result (naive Bayes):
    #   Accuracy: 0.48473680977826916  Precision: 0.3092619027626165
    #   Recall:   0.9576183047321893   F1 Score:  0.4675341161536021

    print('Baseline:')
    print('Accuracy: ' + str(baseline_metrics[0]))
    print('Precision: ' + str(baseline_metrics[1]))
    print('Recall: ' + str(baseline_metrics[2]))
    print('F1 Score: ' + str(baseline_metrics[3]))

    print('\nDecision Tree:')
    print('Accuracy: ' + str(dt_metrics[0]))
    print('Precision: ' + str(dt_metrics[1]))
    print('Recall: ' + str(dt_metrics[2]))
    print('F1 Score: ' + str(dt_metrics[3]))

    _show_scores('\nLogistic Regression:', y_test, lr_pred, len(test_data))
    _show_scores('\nPerceptron Regression:', y_test, perceptron_pred, len(test_data))
    _show_scores('\nPerceptron Regression (ski):', y_test, y_percep_pred, len(test_data))
    _show_scores('\nNaive Bayes (ski):', y_test, y_naive_bayes_pred, len(test_data))

    print("\nCross Validation")
def evaluate_feature_set(config, existing_extractors):
    """Cross-validate a word-level tagger over the "anaphor" tag only.

    Runs the full per-fold train/test cycle and merges the per-fold gold
    labels and predictions; the Mongo persistence step is commented out, so
    the function currently always returns 0.
    """
    feat_extractors = existing_extractors
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    # mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    # essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    # Only the anaphor tag is trained/evaluated in this variant.
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor"))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Per-fold gold labels and predictions, accumulated across folds.
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects; the sentences are lists
        # of featureextractortransformer.Word objects.
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

        vectorizer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = vectorizer.fit_transform(td_feats)
        vd_X = vectorizer.transform(vd_feats)

        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(
            td_X, wd_td_ys_bytag, lambda: LogisticRegression(), wd_train_tags, verbose=False)

        """ TEST Tagger """
        td_preds = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_preds = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_preds, vd_preds, wd_td_ys_bytag, wd_vd_ys_bytag

    # results = Parallel(n_jobs=CV_FOLDS)(
    #     delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #     for (essays_TD, essays_VD) in folds)
    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for essays_TD, essays_VD in folds]

    for td_preds, vd_preds, wd_td_ys_bytag, wd_vd_ys_bytag in results:
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_preds, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_preds, cv_wd_vd_predictions_by_tag)

    """ Persist Results to Mongo DB """
    # SUFFIX = "_FEAT_SELECTION"
    # CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    # parameters = dict(config)
    # parameters["extractors"] = list(map(lambda fn: fn.func_name, feat_extractors))
    # parameters["min_feat_freq"] = MIN_FEAT_FREQ
    #
    # wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
    #                                            cv_wd_td_predictions_by_tag, parameters, wd_algo)
    # wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
    #                                            cv_wd_vd_predictions_by_tag, parameters, wd_algo)
    # avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return 0
# BUG FIX: `fn.func_name` was removed in Python 3 — use `__name__`; build a
# list so the names survive the join and remain reusable afterwards.
extractor_names = [fn.__name__ for fn in extractors]
print("Extractors\n\t" + "\n\t".join(extractor_names))

# BUG FIX: on Python 3, dict_items + list raises TypeError — materialize first.
feat_config = dict(list(train_config.items()) + [("extractors", extractors)])

""" LOAD DATA """
train_tagged_essays = load_process_essays(**train_config)

# Copy the train config and point it at the test folder.
test_config = dict(train_config.items())
test_config["folder"] = test_folder
test_tagged_essays = load_process_essays(**test_config)
logger.info("Essays loaded- Train: %i Test %i" % (len(train_tagged_essays), len(test_tagged_essays)))

# most params below exist ONLY for the purposes of the hashing to and from disk
train_essay_feats = extract_features(train_tagged_essays, **feat_config)
test_essay_feats = extract_features(test_tagged_essays, **feat_config)
logger.info("Features loaded")

""" DEFINE TAGS """
# Regular concept codes start with a digit; duplicates kept for Counter.
_, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
all_regular_tags = list(t for t in flatten(lst_all_tags) if t[0].isdigit())
tag_freq = Counter(all_regular_tags)
regular_tags = list(tag_freq.keys())

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags = regular_tags  # tags to evaluate against

# Single train/test split (no cross validation here).
folds = [(train_essay_feats, test_essay_feats)]

""" CLASSIFIERS """
True ]: # Don't replace if there is one or more real noun phrases in the reference updated_essays = get_processed_essays( tagged_essays, coref_files, max_mention_len=max_mention_len, max_reference_len=max_reference_len, must_not_have_noun_phrase=must_not_have_noun_phrase) """ LOAD DATA """ assert len(updated_essays) == len( tagged_essays ), "Must be same number of essays after processing" print(len(updated_essays), "updated essays") # most params below exist ONLY for the purposes of the hashing to and from disk train_essay_feats = extract_features(updated_essays, **feat_config) logger.info("Features loaded") """ DEFINE TAGS """ _, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats) all_regular_tags = list( (t for t in flatten(lst_all_tags) if t[0].isdigit())) tag_freq = Counter(all_regular_tags) regular_tags = list(tag_freq.keys()) """ works best with all the pair-wise causal relation codes """ wd_train_tags = regular_tags wd_test_tags = regular_tags # tags to evaluate against folds = cross_validation(train_essay_feats, CV_FOLDS) """ CLASSIFIERS """
# Grid over coreference-replacement settings. The wider sweeps are kept
# commented out; the active run uses a single configuration.
# for max_reference_len in [1, 2, 3, 5, 10, 100]:
#     for max_mention_len in [1, 2, 3, 5, 10, 100]:
for max_reference_len in [0]:
    for max_mention_len in [0]:
        # for must_not_have_noun_phrase in [True, False]:
        # Don't replace if there is one or more real noun phrases in the reference
        for must_not_have_noun_phrase in [True]:
            updated_essays = replace_corefs_with_mentions(
                tagged_essays, coref_files,
                max_mention_len=max_mention_len,
                max_reference_len=max_reference_len,
                must_not_have_noun_phrase=must_not_have_noun_phrase)

            """ LOAD DATA """
            assert len(updated_essays) == len(tagged_essays), "Must be same number of essays after processing"
            print(len(updated_essays), "updated essays")

            # most params below exist ONLY for the purposes of the hashing to and from disk
            train_essay_feats = extract_features(updated_essays, **feat_config)
            logger.info("Features loaded")

            """ DEFINE TAGS """
            _, lst_all_tags = flatten_to_wordlevel_feat_tags(train_essay_feats)
            all_regular_tags = list(t for t in flatten(lst_all_tags) if t[0].isdigit())
            tag_freq = Counter(all_regular_tags)
            regular_tags = list(tag_freq.keys())

            """ works best with all the pair-wise causal relation codes """
            wd_train_tags = regular_tags
            wd_test_tags = regular_tags  # tags to evaluate against

            folds = cross_validation(train_essay_feats, CV_FOLDS)