def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Fit an HMM tagger on one fold and return actual/predicted binary tags.

    The training essays (``essays_TD``) and validation essays (``essays_VD``)
    are converted to tagged sentences with
    ``to_most_common_code_tagged_sentences``; the model is trained on the
    training sentences and then used to tag both sets. Actual labels are
    derived separately via ``to_label_powerset_tagged_sentences``.

    Reads the module-level ``STEM`` flag; when it is truthy the module-level
    ``stem`` callable is used as the token projection, otherwise tokens are
    passed through unchanged.

    Returns a 4-tuple, each element being the result of
    ``to_flattened_binary_tags_by_code`` for ``regular_tags``:
    (actual training tags, actual validation tags,
    predicted training tags, predicted validation tags).
    """
    # Identity projection unless stemming is switched on.
    projection = stem if STEM else (lambda x: x)

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - code frequencies come from the training essays only
    # (NO CHEATING), and are reused when tagging the validation essays.
    train_code_counts = tally_code_frequencies(essays_TD)

    # Tagged sentences: training set first, then validation set.
    train_tagged, valid_tagged = [
        to_most_common_code_tagged_sentences(
            essays, regular_tags, train_code_counts, projection=projection)
        for essays in (essays_TD, essays_VD)
    ]

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)

    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    # For evaluation - binary tags
    # YS (ACTUAL)
    train_powerset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_powerset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)
    actual_train = to_flattened_binary_tags_by_code(train_powerset, regular_tags)
    actual_valid = to_flattened_binary_tags_by_code(valid_powerset, regular_tags)

    # YS (PREDICTED)
    predicted_train = to_flattened_binary_tags_by_code(train_predicted, regular_tags)
    predicted_valid = to_flattened_binary_tags_by_code(valid_predicted, regular_tags)

    return actual_train, actual_valid, predicted_train, predicted_valid
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on one fold and return actual/predicted binary tags.

    The training essays (``essays_TD``) and validation essays (``essays_VD``)
    are converted to tagged sentences with
    ``to_most_common_code_tagged_sentences``. A ``CRFTagger`` built with the
    module-level ``comp_feat_extactor`` is trained on the training sentences,
    writing its model to a file under the module-level ``models_folder``, and
    is then used to tag both sets. The model file is deleted before returning,
    including when training, tagging or conversion raises.

    Returns a 4-tuple, each element being the result of
    ``to_flattened_binary_tags_by_code`` for ``regular_tags``:
    (actual training tags, actual validation tags,
    predicted training tags, predicted validation tags).
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrent runs of the same fold from sharing a
    # file name. NOTE(review): randint does not guarantee uniqueness.
    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)
    finally:
        # Always clean up the on-disk model so a failure mid-fold does not
        # leave stray files behind. The existence check avoids masking the
        # original error when training failed before the file was written.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on one fold and return actual/predicted binary tags.

    Tagged sentences for the training essays (``essays_TD``) and validation
    essays (``essays_VD``) are produced by
    ``to_most_common_code_tagged_sentences``, using code frequencies tallied
    from the training essays only. A ``CRFTagger`` using the module-level
    ``comp_feat_extactor`` is trained on the training sentences; its model is
    written to a file under the module-level ``models_folder`` and removed
    again before the function exits, whether it returns or raises.

    Returns a 4-tuple of ``to_flattened_binary_tags_by_code`` results for
    ``regular_tags``: (actual training tags, actual validation tags,
    predicted training tags, predicted validation tags).
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    # Random suffix so repeated runs of the same fold use different file
    # names. NOTE(review): randint does not guarantee uniqueness.
    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD,
                                                           regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD,
                                                           regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                          regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                          regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            vd_predictions, regular_tags)
    finally:
        # Remove the model file even when something above raised; guard on
        # existence so a failed training run does not raise a second error.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
ngram_size=1, positional=False, stem_words=False) extractors = [ unigram_bow_window, unigram_window_stemmed, biigram_window_stemmed, #trigram_window_stemmed, extract_brown_cluster, #extract_dependency_relation ] comp_feat_extactor = fact_composite_feature_extractor(extractors) code_freq = tally_code_frequencies(train_tagged_essays) folds = [(train_tagged_essays, test_tagged_essays)] for feat_poss_state in [False]: for feat_poss_transitions in [True]: for c2 in [1.0]: # different to CB cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict( list), defaultdict(list) cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict( list), defaultdict(list) training_opt = { "feature.possible_states": feat_poss_state, "feature.possible_transitions": feat_poss_transitions, "c2": c2 }
# Unigram feature extractor: not positional, not stemmed, over the window
# given by `offset`.
unigram_bow_window = fact_extract_ngram_features(
    offset=offset, ngram_size=1, positional=False, stem_words=False)

# Feature extractors combined into the single feature function below.
# The commented-out entries are currently disabled.
extractors = [
    unigram_bow_window,
    unigram_window_stemmed,
    biigram_window_stemmed,
    #trigram_window_stemmed,
    extract_brown_cluster,
    #extract_dependency_relation
]

comp_feat_extactor = fact_composite_feature_extractor(extractors)

code_freq = tally_code_frequencies(train_tagged_essays)

# A single (train, test) split rather than cross-validation folds.
folds = [(train_tagged_essays, test_tagged_essays)]

# Each hyper-parameter list currently holds one value, so the nested loops
# run their body exactly once.
for feat_poss_state in [False]:
    for feat_poss_transitions in [True]:
        for c2 in [1.0]:
            # different to CB
            cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
            cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

            training_opt = {
                "feature.possible_states": feat_poss_state,
                "feature.possible_transitions": feat_poss_transitions,
                "c2": c2,
            }
            # Same options with "." replaced by "_" in every key.
            training_opt_copy = {k.replace(".", "_"): v for k, v in training_opt.items()}