def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Measure the feature-matrix width produced by adding `new_extractor`.

    Runs feature extraction with `existing_extractors` + `new_extractor`,
    vectorizes each CV fold, and returns the mean number of feature columns
    over the training and validation splits.

    NOTE(review): reads module-level `tagged_essays`, `CV_FOLDS`,
    `MIN_FEAT_FREQ` and `SPARSE_WD_FEATS`. `features_filename_prefix` is kept
    only for the (currently disabled) disk memoization of `extract_features`.

    Returns:
        (mean_train_cols, mean_valid_cols) as floats.
    """
    feat_extractors = existing_extractors + [new_extractor]
    # Copy the config and add the extractor list (don't mutate the caller's dict).
    feat_config = dict(config.items())
    feat_config["extractors"] = feat_extractors

    """ LOAD FEATURES """
    # most params above exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    # (Removed the dead tag computation and unused LogisticRegression factory
    #  from the original — nothing below consumed them.)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def fold_matrix_shapes(essays_TD, essays_VD):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects.
        td_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, _ = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        # Fit the vocabulary on training data only; validation data is
        # transformed with the fitted vocabulary.
        td_X = feature_transformer.fit_transform(td_feats)
        vd_X = feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    td_col_sizes, vd_col_sizes = [], []
    for essays_TD, essays_VD in folds:
        td_shape, vd_shape = fold_matrix_shapes(essays_TD, essays_VD)
        td_col_sizes.append(td_shape[1])
        vd_col_sizes.append(vd_shape[1])
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
def run(self, min_wd_cnt = 5, stem = True, spelling_correct = True, folds = 10):
    """Load, preprocess and featurize the tagged sentences, then set up
    cross-validation folds and iterate the tag codes.

    We don't want to remove stop words.

    Args:
        min_wd_cnt: minimum word frequency kept by preprocessing.
        stem: whether to stem words.
        spelling_correct: whether to apply spelling correction.
        folds: number of cross-validation folds.
    """
    sentences, tagged_sentences = self.load_tagged_sentences()
    processed_sentences = self.process_sentences(sentences, spelling_correct, stem, min_wd_cnt)
    # list(...) is required on Python 3, where map() returns a lazy iterator:
    # np.asarray(<map object>) would produce a useless 0-d object array.
    sentence_features = np.asarray(list(map(self.features_for_sentence, processed_sentences)))
    cross_validation_ixs = cross_validation(range(len(sentences)), folds)
    codes = sorted(set(flatten(tagged_sentences)))
    for code in codes:
        code_tags = self.tags_for_code(code, tagged_sentences)
        pass
    pass
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Report how wide the feature space becomes when `new_extractor` is
    appended to `existing_extractors`: returns the mean training and
    validation feature-matrix column counts across the CV folds.

    NOTE(review): reads module-level `tagged_essays`, `CV_FOLDS`,
    `MIN_FEAT_FREQ` and `SPARSE_WD_FEATS`.
    """
    feat_extractors = existing_extractors + [new_extractor]
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        td_X = feature_transformer.fit_transform(td_feats)
        vd_X = feature_transformer.transform(vd_feats)
        return td_X.shape, vd_X.shape

    #results = Parallel(n_jobs=CV_FOLDS)(
    #    delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #    for (essays_TD, essays_VD) in folds)
    shape_pairs = [train_tagger(td, vd, wd_test_tags, wd_train_tags) for (td, vd) in folds]
    td_col_sizes = [td_shape[1] for td_shape, _ in shape_pairs]
    vd_col_sizes = [vd_shape[1] for _, vd_shape in shape_pairs]
    return np.mean(td_col_sizes), np.mean(vd_col_sizes)
def run(self, min_wd_cnt=5, stem=True, spelling_correct=True, folds=10):
    """Load, preprocess and featurize the tagged sentences, then set up
    cross-validation folds and iterate the tag codes.

    We don't want to remove stop words.

    Args:
        min_wd_cnt: minimum word frequency kept by preprocessing.
        stem: whether to stem words.
        spelling_correct: whether to apply spelling correction.
        folds: number of cross-validation folds.
    """
    sentences, tagged_sentences = self.load_tagged_sentences()
    processed_sentences = self.process_sentences(sentences, spelling_correct, stem, min_wd_cnt)
    # list(...) is required on Python 3, where map() returns a lazy iterator:
    # np.asarray(<map object>) would produce a useless 0-d object array.
    sentence_features = np.asarray(list(map(self.features_for_sentence, processed_sentences)))
    cross_validation_ixs = cross_validation(range(len(sentences)), folds)
    codes = sorted(set(flatten(tagged_sentences)))
    for code in codes:
        code_tags = self.tags_for_code(code, tagged_sentences)
        pass
    pass
#assert set(CAUSE_TAGS).issubset(set(sent_input_feat_tags)), "To extract causal relations, we need Causer tags" # tags to evaluate against """ CLASSIFIERS """ """ Log Reg + Log Reg is best!!! """ f_output_file = open(out_predictions_file, "w+") f_output_file.write( "Essay|Sent Number|Processed Sentence|Concept Codes|Predictions\n") # Gather metrics per fold cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict( list), defaultdict(list) cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict( list), defaultdict(list) folds = cross_validation(essay_feats, CV_FOLDS) def pad_str(val): return str(val).ljust(20) + " " def toDict(obj): return obj.__dict__ #TODO Parallelize for i, (essays_TD, essays_VD) in enumerate(folds): # TD and VD are lists of Essay objects. The sentences are lists # of featureextractortransformer.Word objects
# Get Test Data In Order to Get Test CRELS # load the test essays to make sure we compute metrics over the test CR labels test_config = get_config(test_folder) tagged_essays_test = load_process_essays(**test_config) ######################################################## fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill" with open(fname, "rb") as f: pred_tagged_essays = dill.load(f) logger.info("Started at: " + str(datetime.datetime.now())) logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays)) # should be 902 cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays, tag_essays_test=tagged_essays_test) cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS) # type: List[Tuple[Any,Any]] def evaluate_features( collection_prefix: str, folds: List[Tuple[Any, Any]], extractor_fn_names_lst: List[str], cost_function_name: str, beta: float, base_learner: Any, ngrams: int, stemmed: bool, down_sample_rate=1.0) -> float: if down_sample_rate < 1.0: new_folds = [] # type: List[Tuple[Any, Any]] for i, (essays_TD, essays_VD) in enumerate(folds): essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
# NOTE(review): script fragment — depends on module-level extractor functions,
# `tagged_essays`, `regular_tags`, `train_classifer_on_fold` and joblib's
# Parallel/delayed, all defined elsewhere in the file.
# Window-based feature extractor functions combined into one composite extractor.
# (NOTE(review): `biigram`/`extactor`/`classifer` spellings match identifiers
# defined elsewhere — do not "fix" them in isolation.)
extractors = [
    unigram_bow_window,
    unigram_window_stemmed,
    biigram_window_stemmed,
    #trigram_window_stemmed,
    extract_brown_cluster,
    #extract_dependency_relation
]

comp_feat_extactor = fact_composite_feature_extractor(extractors)

# Fold-merged gold labels and predictions, keyed by tag.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(tagged_essays, CV_FOLDS)
# One joblib worker per fold.
results = Parallel(n_jobs=CV_FOLDS)(
    delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
    for fold, (essays_TD, essays_VD) in enumerate(folds))

for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")
results_processor = ResultsProcessor(dbname="metrics_coref_causal")

# Coref-annotated essays with predicted causal-relation tags, serialized
# with dill. NOTE(review): dill.load runs arbitrary code — trusted local files assumed.
train_fname = coref_output_folder + "training_crel_anatagged_essays_most_recent_code.dill"
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)

test_fname = coref_output_folder + "test_crel_anatagged_essays_most_recent_code.dill"
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

logger.info("Started at: " + str(datetime.datetime.now()))
logger.info("Number of pred tagged essays %i" % len(pred_tagged_essays_train)) # should be 902

cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
# cv_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)] # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)

def evaluate_model(collection_prefix: str,
                   folds: List[Tuple[Any, Any]],
                   extractor_fn_names_lst: List[str],
                   cost_function_name: str,
                   beta: float,
                   ngrams: int,
                   stemmed: bool,
                   max_epochs: int,
                   down_sample_rate=1.0) -> float:
    """Evaluate the model configuration over the CV folds.

    NOTE(review): body truncated at the fragment boundary — only the optional
    training-set down-sampling is visible here.
    """
    if down_sample_rate < 1.0:
        # Keep only the leading fraction of each fold's training essays.
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
def evaluate_feature_set(config, existing_extractors, new_extractor, features_filename_prefix):
    """Train/test a word-level tagger with `existing_extractors` + `new_extractor`
    over the CV folds, persist the fold-merged results to Mongo, and return
    the validation micro-averaged F1 score.

    NOTE(review): reads module-level `tagged_essays`, `CV_FOLDS`,
    `MIN_FEAT_FREQ`, `SPARSE_WD_FEATS`, `processor` and `__MICRO_F1__`.
    `features_filename_prefix` is kept only for the (disabled) disk memoization.
    """
    feat_extractors = existing_extractors + [new_extractor]
    # list(...) is required on Python 3, where dict.items() returns a view
    # that cannot be concatenated with a list (this also matches the other
    # copies of this function in the file).
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    # Concept codes start with a digit (relation codes do not).
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t[0].isdigit()))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        # Fit the vocabulary on training data only.
        td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)
        wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
        wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)

        """ TEST Tagger """
        td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
        vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
        return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #    delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #    for (essays_TD, essays_VD) in folds)
    results = [train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
               for (essays_TD, essays_VD) in folds]

    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    SUFFIX = "_FEAT_SELECTION"
    CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    # fn.__name__ instead of the Python-2-only fn.func_name, and list(...) so a
    # real list — not a lazy map object — is persisted.
    parameters["extractors"] = list(map(lambda fn: fn.__name__, feat_extractors))
    parameters["min_feat_freq"] = MIN_FEAT_FREQ

    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
# for all other tags, a 0 for tag in (vtags - set([EMPTY_TAG, pred_tag])): pred_ys_by_tag[tag].append(0) if EMPTY_TAG in pred_ys_by_tag: del pred_ys_by_tag[EMPTY_TAG] return pred_ys_by_tag def train_dev_split(lst, dev_split): # random shuffle shuffle(lst) num_training = int((1.0 - dev_split) * len(lst)) return lst[:num_training], lst[num_training:] folds = cross_validation(tagged_essays, CV_FOLDS) fold2training_data = {} fold2dev_data = {} fold2test_data = {} for i, (essays_TD, essays_VD) in enumerate(folds): # further split into train and dev test essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT) fold2training_data[i] = get_training_data(essays_train) fold2dev_data[i] = get_training_data(essays_dev) # Test Data fold2test_data[i] = get_training_data(essays_VD) # ## Load Glove 100 Dim Embeddings # see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
from LoadData import loadTestData
from LoadData import loadTrainData
from Knn import Knn
from CrossValidation import cross_validation
import numpy as np
import math

# Smoke-test driver: load CIFAR-10, report array shapes, then run a 4-fold
# cross-validation on a small subset so the KNN run finishes quickly.
trainData, trainLabel = loadTrainData("cifar-10-batches-py/")
print(np.shape(trainData), np.shape(trainLabel))

testData, testLabel = loadTestData("cifar-10-batches-py/")
print(np.shape(testData), np.shape(testLabel))

# Down-sample both splits to keep the cross-validation cheap.
trainData, trainLabel = trainData[:100], trainLabel[:100]
testData, testLabel = testData[:10], testLabel[:10]

accuracy = cross_validation(trainData, trainLabel, 4)
print(accuracy)
pred_ys_by_tag[pred_tag].append(1) # for all other tags, a 0 for tag in (vtags - set([EMPTY_TAG, pred_tag])): pred_ys_by_tag[tag].append(0) if EMPTY_TAG in pred_ys_by_tag: del pred_ys_by_tag[EMPTY_TAG] return pred_ys_by_tag def train_dev_split(lst, dev_split): # random shuffle shuffle(lst) num_training = int((1.0 - dev_split) * len(lst)) return lst[:num_training], lst[num_training:] folds = cross_validation(tagged_essays, CV_FOLDS) fold2training_data = {} fold2dev_data = {} fold2test_data = {} for i, (essays_TD, essays_VD) in enumerate(folds): # further split into train and dev test essays_train, essays_dev = train_dev_split(essays_TD, DEV_SPLIT) fold2training_data[i] = get_training_data(essays_train) fold2dev_data[i] = get_training_data(essays_dev) # Test Data fold2test_data[i] = get_training_data(essays_VD) # ## Load Glove 100 Dim Embeddings # see /Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/DeepLearning/WordVectors/pickle_glove_embedding.py
# NOTE(review): script fragment — hyperparameter settings and the essay-level
# evaluation run; depends on `pred_tagged_essays_train/_test`,
# `all_extractor_fns`, `ngrams` and helpers defined elsewhere in the file.
stemmed = True
cost_function_name = micro_f1_cost_plusepsilon.__name__
# LogisticRegression hyperparameters for the base learner.
dual = True
fit_intercept = True
beta = 0.5
max_epochs = 2
C = 0.5
penalty = "l2"
# Note these also differ for SC dataset
BASE_LEARNER_FACT = lambda: LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)

# Extractor combination previously found to perform best.
best_extractor_names = ['single_words', 'between_word_features', 'label_set', 'three_words', 'third_order', 'unigrams']  # type: List[str]

test_folds = [(pred_tagged_essays_train, pred_tagged_essays_test)]  # type: List[Tuple[Any,Any]]
cv_folds = cross_validation(pred_tagged_essays_train, CV_FOLDS)  # type: List[Tuple[Any,Any]]

result_test_essay_level = evaluate_model_essay_level(
    folds=cv_folds,
    extractor_fn_names_lst=best_extractor_names,
    all_extractor_fns=all_extractor_fns,
    ngrams=ngrams,
    beta=beta,
    stemmed=stemmed,
    down_sample_rate=1.0,
    max_epochs=max_epochs)

models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_td_preds_by_sent, cv_sent_vd_ys_by_tag = result_test_essay_level

mean_metrics = ResultsProcessor.compute_mean_metrics(cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag)
print(get_micro_metrics(metrics_to_df(mean_metrics)))
# NOTE(review): Python 2 fragment (print statements) — the loop below is the
# tail of a training function whose `def` (and `code2classifier` init) is
# outside this view; indentation reconstructed.
    for code in codes:
        print "Training for :", code
        cls = fn_create_cls()
        code2classifier[code] = cls
        ys = np.asarray(yByCode[code])
        #ys = map(map_y, ys)
        cls.fit(xs, ys)
    return code2classifier

fn_classifier = LinearSVC
SPLITS = 2
causal_codes = cr_codes + ["explicit"]
ixs = range(len(sentences))
folds = cross_validation(ixs, SPLITS)

td_metrics = []
vd_metrics = []
for num, (ix_train, ix_valid) in enumerate(folds):
    print "Fold:", num + 1
    # Train sequential classifier
    xs_t, yByCode_t = extract_xs_ys(ix_train, ix2xs, ix2ys, all_codes)
    code2cls = train(all_codes, xs_t, yByCode_t, fn_classifier)
    print "Training Sentence Classifier"
    # Extract new data points and target classes
    ix2xs_sent = to_sentence_level_predictions(ix2xs, code2cls)
    newxs_t, newyByCode_t = extract_xs_ys(ix_train, ix2xs_sent, ix2ys_sent, all_codes + causal_codes)
# NOTE(review): Python 2 fragment (print statements) — the block below is the
# body of a training function whose `def` line is outside this view;
# indentation reconstructed.
    code2classifier = {}
    for code in codes:
        print "Training for :", code
        cls = fn_create_cls()
        code2classifier[code] = cls
        ys = np.asarray(yByCode[code])
        #ys = map(map_y, ys)
        cls.fit(xs, ys)
    return code2classifier

fn_classifier = LinearSVC
SPLITS = 2
causal_codes = cr_codes + ["explicit"]
ixs = range(len(sentences))
folds = cross_validation(ixs, SPLITS)

td_metrics = []
vd_metrics = []
for num, (ix_train, ix_valid) in enumerate(folds):
    print "Fold:", num + 1
    # Train sequential classifier
    xs_t, yByCode_t = extract_xs_ys(ix_train, ix2xs, ix2ys, all_codes)
    code2cls = train(all_codes, xs_t, yByCode_t, fn_classifier)
    print "Training Sentence Classifier"
    # Extract new data points and target classes
    ix2xs_sent = to_sentence_level_predictions(ix2xs, code2cls)
    newxs_t, newyByCode_t = extract_xs_ys(ix_train, ix2xs_sent, ix2ys_sent, all_codes + causal_codes)
# NOTE(review): fragment — the block down to the first `return` is the tail of
# a per-fold tagger-training function (`td_tags`, `vd_tags`, `td_X`, `vd_X`,
# `fold`, `dual`, `C`, `penalty`, `fit_intercept` are defined above this view);
# indentation reconstructed.
    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    """ TRAIN Tagger """
    create_classifier = lambda : LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept)
    # Log the configured classifier once (first fold only).
    if fold == 0:
        print(create_classifier())
    tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, create_classifier, wd_train_tags, verbose=False)

    """ TEST Tagger """
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

folds = cross_validation(essay_feats, CV_FOLDS)

def evaluate_tagger(dual, C, penalty, fit_intercept):
    # locals() is called first so that hyper_opt_params captures exactly the
    # four hyperparameter arguments.
    hyper_opt_params = locals()
    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    """ This doesn't run in parallel ! Sequential operation takes exactly same duration """
    # results = Parallel(n_jobs=CV_FOLDS, verbose=0, backend='multiprocessing')(
    #     delayed(train_tagger)(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags, dual, C, penalty, fit_intercept)
    #     for fold, (essays_TD, essays_VD) in enumerate(folds))
    #
    # for result in results:
    #     td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
# Script section: build the CRF-format corpus, extract features and train one
# classifier per fold in parallel.
# NOTE(review): depends on `tagged_essays`, `config`, `train_classifer_on_fold`,
# `CV_FOLDS` and joblib's Parallel/delayed, defined elsewhere in the file.
logger.info("Essays loaded")

# Create Corpus in CRF Format (list of list of tuples(word,tag))
# --------------------------------------------------------------
tag_freq = get_tag_freq(tagged_essays)
# Concept codes start with a digit; the freq >= 0 filter keeps all of them.
regular_tags = list(set(tag for tag, freq in tag_freq.items() if freq >= 0 and tag[0].isdigit()))

""" FEATURE EXTRACTION """
config["window_size"] = 11
# Floor division: the window is centred on the target word, so the offset must
# be an int. Plain "/" yields a float on Python 3 ("//" is identical on Python 2).
offset = (config["window_size"] - 1) // 2

# Fold-merged gold labels and predictions, keyed by tag.
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = cross_validation(tagged_essays, CV_FOLDS)
# One joblib worker per fold.
results = Parallel(n_jobs=CV_FOLDS)(
    delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
    for fold, (essays_TD, essays_VD) in enumerate(folds))

for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")
# NOTE(review): fragment — the first line is the tail of an MLP-classifier
# constructor call started above this view; indentation reconstructed.
                    hidden_layer_sizes=(10, 2), random_state=111)

models = [forest, gdBoost, mlp]
names = ["Random Forest", "Gradient Boosting", "MuliLayer Perceptrons"]

# Vars to select the best suited model
bestModel = None
bestName = "none"
bestMean = 0.0
x_best = []
scenario = "none"

# Using all the features
print("------------------------------------------")
print("------All Features -----------------------")
model, name, mean = cross_validation(x_norm, y, models, names)
if (mean > bestMean):
    bestModel, bestName, bestMean = model, name, mean
    x_best = x_norm
    scenario = "all"

# Removing features with low variance
print("------------------------------------------")
print("------Removing features with low variance -----------------------")
sel = VarianceThreshold(threshold=(0.01))
x_case = sel.fit_transform(x_norm)
model, name, mean = cross_validation(x_case, y, models, names)
if (mean > bestMean):
    bestModel, bestName, bestMean = model, name, mean
    x_best = x_case
    scenario = "variance"
for tag in tags: stag_freq[tag] += 1 # TODO - don't ignore Anaphor, other and rhetoricals here cr_tags = list((t for t in stag_freq.keys() if ("->" in t) and not "Anaphor" in t and not "other" in t and not "rhetorical" in t and not "factor" in t and 1 == 1)) regular_tags = set( (t for t in stag_freq.keys() if ("->" not in t) and (t[0].isdigit()))) #regular_tags = set((t for t in stag_freq.keys() if ( "->" not in t) and (t == "explicit" or t[0].isdigit()))) vtags = set(regular_tags) assert "explicit" not in vtags, "explicit should NOT be in the regular tags" cv_folds = cross_validation(pred_tagged_essays, CV_FOLDS) # type: List[Tuple[Any,Any]] def get_functions_by_name(function_names, functions): return [fn for fn in functions if fn.__name__ in function_names] def get_function_names(functions): return list(map(lambda fn: fn.__name__, functions)) def evaluate_features(folds: List[Tuple[Any, Any]], extractor_names: Set[str], cost_function_name: str, beta: float = 0.3, base_learner: Any = LogisticRegression,
def evaluate_feature_set(config, existing_extractors):
    """Train and test a word-level "Anaphor" tagger over the CV folds using
    the supplied feature extractors.

    Accumulates fold-merged gold labels and predictions per tag; result
    persistence is currently disabled, so the function always returns 0.

    NOTE(review): reads module-level `tagged_essays`, `CV_FOLDS`,
    `MIN_FEAT_FREQ` and `SPARSE_WD_FEATS`.
    """
    feat_extractors = existing_extractors
    feat_config = dict(list(config.items()) + [("extractors", feat_extractors)])

    """ LOAD FEATURES """
    # most params below exist ONLY for the purposes of the hashing to and from disk
    #mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix, verbose=False)(extract_features)
    #essay_feats = mem_extract_features(tagged_essays, **feat_config)
    essay_feats = extract_features(tagged_essays, **feat_config)

    """ DEFINE TAGS """
    _, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
    # Only the "anaphor" tag is targeted here (case/whitespace-insensitive).
    regular_tags = list(set(t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor"))

    """ works best with all the pair-wise causal relation codes """
    wd_train_tags = regular_tags
    wd_test_tags = regular_tags

    """ CLASSIFIERS """
    fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
    wd_algo = str(fn_create_wd_cls())

    # Gather metrics per fold
    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    folds = cross_validation(essay_feats, CV_FOLDS)

    def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags):
        # TD and VD are lists of Essay objects. The sentences are lists
        # of featureextractortransformer.Word objects
        """ Data Partitioning and Training """
        train_feats, train_tags = flatten_to_wordlevel_feat_tags(essays_TD)
        valid_feats, valid_tags = flatten_to_wordlevel_feat_tags(essays_VD)
        vectorizer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
        # Vocabulary is fitted on the training split only.
        train_X = vectorizer.fit_transform(train_feats)
        valid_X = vectorizer.transform(valid_feats)
        train_ys_bytag = get_wordlevel_ys_by_code(train_tags, wd_train_tags)
        valid_ys_bytag = get_wordlevel_ys_by_code(valid_tags, wd_train_tags)

        """ TRAIN Tagger """
        tag2word_classifier = train_classifier_per_code(train_X, train_ys_bytag,
                                                        lambda: LogisticRegression(),
                                                        wd_train_tags, verbose=False)

        """ TEST Tagger """
        train_preds_by_code = test_classifier_per_code(train_X, tag2word_classifier, wd_test_tags)
        valid_preds_by_code = test_classifier_per_code(valid_X, tag2word_classifier, wd_test_tags)
        return train_preds_by_code, valid_preds_by_code, train_ys_bytag, valid_ys_bytag

    #results = Parallel(n_jobs=CV_FOLDS)(
    #    delayed(train_tagger)(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    #    for (essays_TD, essays_VD) in folds)
    fold_results = []
    for essays_train, essays_valid in folds:
        fold_results.append(train_tagger(essays_train, essays_valid, wd_test_tags, wd_train_tags))

    for td_preds, vd_preds, td_ys, vd_ys in fold_results:
        merge_dictionaries(td_ys, cv_wd_td_ys_by_tag)
        merge_dictionaries(vd_ys, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_preds, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_preds, cv_wd_vd_predictions_by_tag)

    # print results for each code
    """ Persist Results to Mongo DB """
    # SUFFIX = "_FEAT_SELECTION"
    # CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
    # parameters = dict(config)
    # parameters["extractors"] = list(map(lambda fn: fn.func_name, feat_extractors))
    # parameters["min_feat_freq"] = MIN_FEAT_FREQ
    #
    # wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag,
    #                                            cv_wd_td_predictions_by_tag, parameters, wd_algo)
    # wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag,
    #                                            cv_wd_vd_predictions_by_tag, parameters, wd_algo)
    # avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return 0