def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {"feature.possible_states": False,
                        "feature.possible_transitions": False,
                        "c2": 2.0
                        }
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))
        # Delete model file now predictions obtained
        # Note, we are randomizing name above, so we need to clean up here
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Esempio n. 3
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq, projection=projection)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):

    # Start Training
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)
    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Esempio n. 6
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))
        # Delete model file now predictions obtained
        # Note, we are randomizing name above, so we need to clean up here
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Esempio n. 7
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            training_opt):

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor,
                      verbose=False,
                      training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                      regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                      regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)
    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        trainer = HiddenMarkovModelTrainer()
        model = trainer.train_supervised(td)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")

""" Persist Results to Mongo DB """

wd_algo = "HMM_BR"
Esempio n. 10
0
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        trainer = HiddenMarkovModelTrainer()
        model = trainer.train_supervised(td)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(
            td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(
            vd_predictions)

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")
""" Persist Results to Mongo DB """