def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Fit one HMM tagger on label-powerset tags for a single CV fold.

    Both essay collections are converted with
    ``to_label_powerset_tagged_sentences``; tokens are passed through
    ``stem`` when the module-level ``STEM`` flag is truthy and are left
    unchanged otherwise. The model is trained on the training sentences
    only and is then used to tag both collections.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; only used in the progress message.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code`` (presumably keyed by code -
        confirm against that helper).
    """
    # Identity projection unless stemming is switched on.
    word_projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    # Tagged sentences used for training and as the gold standard.
    train_tagged = to_label_powerset_tagged_sentences(
        essays_TD, regular_tags, projection=word_projection)
    valid_tagged = to_label_powerset_tagged_sentences(
        essays_VD, regular_tags, projection=word_projection)

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)

    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    # Actual labels first, predicted labels second - all split per code.
    return (
        to_flattened_binary_tags_by_code(train_tagged, regular_tags),
        to_flattened_binary_tags_by_code(valid_tagged, regular_tags),
        to_flattened_binary_tags_by_code(train_predicted, regular_tags),
        to_flattened_binary_tags_by_code(valid_predicted, regular_tags),
    )
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code for a single CV fold.

    For every code in ``regular_tags`` (visited in sorted order) a fresh
    ``CRFTagger`` is trained on that code's training sentences and used to
    tag both the training and the validation sentences. Each model is
    written to a randomly named file under ``models_folder``; that file is
    removed as soon as the predictions are available, including when
    training or tagging raises.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes to train a model for.
        fold: Fold number; used in the progress message and the file name.

    Returns:
        A 4-tuple of dicts keyed by code:
        ``(td_actual, vd_actual, td_predicted, vd_predicted)``, each value
        being the result of ``to_flattened_binary_tags``.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    # documentation: http://www.chokkan.org/software/crfsuite/manual.html
    # The options are the same for every code, so build them once.
    training_opt = {
        "feature.possible_states": False,
        "feature.possible_transitions": False,
        "c2": 2.0,
    }

    wd_td_ys_bytag = {}
    wd_vd_ys_bytag = {}
    td_wd_predictions_by_code = {}
    vd_wd_predictions_by_code = {}

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # The random suffix keeps concurrently running experiments from
        # overwriting each other's model file.
        model_filename = os.path.join(
            models_folder,
            "%i_%s__%s" % (fold, code, str(randint(0, 9999999))))

        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False,
                          training_opt=training_opt)
        try:
            model.train(td, model_filename)
            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Always clean up the randomly named model file; it may not
            # exist yet if training failed before writing it.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)
        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Fit one HMM tagger on most-common-code tags for a single CV fold.

    Code frequencies are tallied from the training essays only and are
    used by ``to_most_common_code_tagged_sentences`` for both essay
    collections. Tokens go through ``stem`` when the module-level ``STEM``
    flag is truthy. The gold labels used for evaluation come from a
    separate label-powerset conversion of the same essays (made without a
    projection).

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; only used in the progress message.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Identity projection unless stemming is switched on.
    word_projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    # Frequencies must come from the training split alone, otherwise
    # information about the validation data would leak into training.
    training_code_freq = tally_code_frequencies(essays_TD)

    train_tagged = to_most_common_code_tagged_sentences(
        essays_TD, regular_tags, training_code_freq,
        projection=word_projection)
    valid_tagged = to_most_common_code_tagged_sentences(
        essays_VD, regular_tags, training_code_freq,
        projection=word_projection)

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)

    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    # Gold standard: the label-powerset view of the essays.
    train_gold = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_gold = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    return (
        to_flattened_binary_tags_by_code(train_gold, regular_tags),
        to_flattened_binary_tags_by_code(valid_gold, regular_tags),
        to_flattened_binary_tags_by_code(train_predicted, regular_tags),
        to_flattened_binary_tags_by_code(valid_predicted, regular_tags),
    )
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a label-powerset HMM tagger for one fold and score inputs.

    The essays are turned into tagged sentences by
    ``to_label_powerset_tagged_sentences`` (stemming the tokens when the
    module-level ``STEM`` flag is truthy), a model is trained on the
    training sentences, and both sets are re-tagged by that model.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; only used in the progress message.

    Returns:
        ``(td_actual, vd_actual, td_predicted, vd_predicted)`` - four
        results of ``to_flattened_binary_tags_by_code``.
    """
    if STEM:
        token_fn = stem
    else:
        # No stemming requested: leave every token as it is.
        token_fn = lambda x: x

    print("Fold %i Training code" % fold)

    sents_td = to_label_powerset_tagged_sentences(
        essays_TD, regular_tags, projection=token_fn)
    sents_vd = to_label_powerset_tagged_sentences(
        essays_VD, regular_tags, projection=token_fn)

    hmm_trainer = HiddenMarkovModelTrainer()
    tagger = hmm_trainer.train_supervised(sents_td)

    predicted_td = tagger.tag_sents(to_sentences(sents_td))
    predicted_vd = tagger.tag_sents(to_sentences(sents_vd))

    # Actual labels, split per code.
    actual_td_by_code = to_flattened_binary_tags_by_code(sents_td, regular_tags)
    actual_vd_by_code = to_flattened_binary_tags_by_code(sents_vd, regular_tags)

    # Predicted labels, split per code.
    predicted_td_by_code = to_flattened_binary_tags_by_code(
        predicted_td, regular_tags)
    predicted_vd_by_code = to_flattened_binary_tags_by_code(
        predicted_vd, regular_tags)

    return (actual_td_by_code, actual_vd_by_code,
            predicted_td_by_code, predicted_vd_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train one CRF tagger on most-common-code tags for a single CV fold.

    Both essay collections are converted with
    ``to_most_common_code_tagged_sentences`` using the supplied
    ``code_freq``. A ``CRFTagger`` is trained on the training sentences
    and tags both collections. The model is written to a randomly named
    file under ``models_folder`` which is always removed afterwards, even
    when training or tagging raises. Gold labels come from a separate
    label-powerset conversion of the same essays.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; used in the progress message and the file name.
        code_freq: Code frequencies passed to the conversion helper
            (NOTE(review): should be derived from training data only -
            confirm at the call site).
        training_opt: Training options forwarded to ``CRFTagger``.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrently running experiments from
    # overwriting each other's model file.
    model_filename = os.path.join(
        models_folder,
        "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999))))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)
        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the randomly named model file; it may not exist
        # yet if training failed before writing it.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one default-configured CRF tagger per code for a CV fold.

    For every code in ``regular_tags`` (visited in sorted order) a fresh
    ``CRFTagger`` is trained on that code's training sentences and used to
    tag both the training and the validation sentences. Each model is
    written to a randomly named file under ``models_folder``; that file is
    removed as soon as the predictions are available, including when
    training or tagging raises.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes to train a model for.
        fold: Fold number; used in the progress message and the file name.

    Returns:
        A 4-tuple of dicts keyed by code:
        ``(td_actual, vd_actual, td_predicted, vd_predicted)``, each value
        being the result of ``to_flattened_binary_tags``.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = {}
    wd_vd_ys_bytag = {}
    td_wd_predictions_by_code = {}
    vd_wd_predictions_by_code = {}

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # The random suffix keeps concurrently running experiments from
        # overwriting each other's model file.
        model_filename = os.path.join(
            models_folder,
            "%i_%s__%s" % (fold, code, str(randint(0, 9999999))))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        try:
            model.train(td, model_filename)
            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Always clean up the randomly named model file; it may not
            # exist yet if training failed before writing it.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)
        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, training_opt):
    """Train one CRF tagger on label-powerset tags for a single CV fold.

    Both essay collections are converted with
    ``to_label_powerset_tagged_sentences``. A ``CRFTagger`` is trained on
    the training sentences and tags both collections. The model is written
    to a randomly named file under ``models_folder`` which is always
    removed afterwards, even when training or tagging raises.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; used in the progress message and the file name.
        training_opt: Training options forwarded to ``CRFTagger``.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # The random suffix keeps concurrently running experiments from
    # overwriting each other's model file.
    model_filename = os.path.join(
        models_folder,
        "%i_%s__%s" % (fold, "power_set", str(randint(0, 9999999))))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)
        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the randomly named model file; it may not exist
        # yet if training failed before writing it.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger on most-common-code tags for a single CV fold.

    Code frequencies are tallied from the training essays only and are
    used by ``to_most_common_code_tagged_sentences`` for both essay
    collections. A default-configured ``CRFTagger`` is trained on the
    training sentences and tags both collections. The model is written to
    a randomly named file under ``models_folder`` which is always removed
    afterwards, even when training or tagging raises. Gold labels come
    from a separate label-powerset conversion of the same essays.

    Args:
        essays_TD: Training essays for this fold.
        essays_VD: Validation essays for this fold.
        regular_tags: Codes handed to the conversion/flattening helpers.
        fold: Fold number; used in the progress message and the file name.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrently running experiments from
    # overwriting each other's model file.
    model_filename = os.path.join(
        models_folder,
        "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999))))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    try:
        model.train(td_sents, model_filename)
        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the randomly named model file; it may not exist
        # yet if training failed before writing it.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return (wd_td_ys_bytag, wd_vd_ys_bytag,
            td_wd_predictions_by_code, vd_wd_predictions_by_code)
wd_vd_ys_bytag = dict() td_wd_predictions_by_code = dict() vd_wd_predictions_by_code = dict() for code in sorted(regular_tags): print("Fold %i Training code: %s" % (fold, code)) td, vd = td_sents_by_code[code], vd_sents_by_code[code] trainer = HiddenMarkovModelTrainer() model = trainer.train_supervised(td) code2model[code] = model wd_td_ys_bytag[code] = to_flattened_binary_tags(td) wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd) td_predictions = model.tag_sents(to_sentences(td)) vd_predictions = model.tag_sents(to_sentences(vd)) td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions) vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions) merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag) merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag) merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag) merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag) logger.info("Training completed") """ Persist Results to Mongo DB """ wd_algo = "HMM_BR"
wd_vd_ys_bytag = dict() td_wd_predictions_by_code = dict() vd_wd_predictions_by_code = dict() for code in sorted(regular_tags): print("Fold %i Training code: %s" % (fold, code)) td, vd = td_sents_by_code[code], vd_sents_by_code[code] trainer = HiddenMarkovModelTrainer() model = trainer.train_supervised(td) code2model[code] = model wd_td_ys_bytag[code] = to_flattened_binary_tags(td) wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd) td_predictions = model.tag_sents(to_sentences(td)) vd_predictions = model.tag_sents(to_sentences(vd)) td_wd_predictions_by_code[code] = to_flattened_binary_tags( td_predictions) vd_wd_predictions_by_code[code] = to_flattened_binary_tags( vd_predictions) merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag) merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag) merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag) merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag) logger.info("Training completed") """ Persist Results to Mongo DB """