def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    """Train a ``SearnModelTemplateFeatures`` model and collect essay-level results.

    Args:
        essays_TD: essays the model is trained on; also labelled and predicted.
        essays_VD: essays that are only labelled and predicted, never trained on.
        extractor_names: names resolved against ``all_extractor_fns``.
        cost_function_name: single name resolved against ``all_cost_functions``.
        ngrams: forwarded as ``max_ngram_len`` to the n-gram extractor.
        stemmed: truthy selects ``NgramExtractorStemmed``, otherwise ``NgramExtractor``.
        beta: forwarded unchanged to the model constructor.
        max_epochs: forwarded to ``train`` as the ``max_epochs`` keyword.

    Returns:
        A 5-tuple ``(num_feats, td_labels, vd_labels, td_predictions,
        vd_predictions)`` where ``num_feats`` is read from the template
        feature extractor after training and the remaining items are the
        model's essay-level label data and predictions for each essay set.

    Raises:
        AssertionError: if the cost function resolves to ``None`` or the number
            of resolved extractors differs from the number of names.
    """
    feature_fns = get_functions_by_name(extractor_names, all_extractor_fns)
    # The lookup helper works on lists of names: wrap the single cost-function
    # name and take the first result.
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Every requested extractor name must have produced a function.
    assert len(feature_fns) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_extractor = NonLocalTemplateFeatureExtractor(extractors=feature_fns)
    ngram_cls = NgramExtractorStemmed if stemmed else NgramExtractor
    ngram_extractor = ngram_cls(max_ngram_len=ngrams)

    searn_model = SearnModelTemplateFeatures(
        feature_extractor=template_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=BASE_LEARNER_FACT,
        beta=beta,
        # Model log messages are discarded; swap in a printing callable to see them.
        log_fn=lambda s: None,
    )
    searn_model.train(essays_TD, max_epochs=max_epochs)

    # Read the feature count only once training has run.
    num_feats = template_extractor.num_features()

    # Label data is gathered for both essay sets before any prediction is made.
    td_labels = searn_model.get_label_data_essay_level(essays_TD)
    vd_labels = searn_model.get_label_data_essay_level(essays_VD)

    td_predictions = searn_model.predict_essay_level(essays_TD)
    vd_predictions = searn_model.predict_essay_level(essays_VD)

    return num_feats, td_labels, vd_labels, td_predictions, vd_predictions
# Example no. 2
0
def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta,
                        max_epochs=None):
    """Train a ``SearnModelTemplateFeatures`` model and collect its label data and predictions.

    Args:
        essays_TD: essays the model is trained on; also labelled and predicted.
        essays_VD: essays that are only labelled and predicted, never trained on.
        extractor_names: names resolved against ``all_extractor_fns``.
        cost_function_name: single name resolved against ``all_cost_functions``.
        ngrams: forwarded as ``max_ngram_len`` to the n-gram extractor.
        stemmed: truthy selects ``NgramExtractorStemmed``, otherwise ``NgramExtractor``.
        beta: forwarded unchanged to the model constructor.
        max_epochs: epoch limit handed to ``train``. ``None`` (the default)
            falls back to the module-level ``MAX_EPOCHS``, which is what this
            function always used before the parameter existed.

    Returns:
        A 5-tuple ``(num_feats, td_labels, vd_labels, td_predictions,
        vd_predictions)`` where ``num_feats`` is read from the template
        feature extractor after training and the remaining items come from
        the model's ``get_label_data`` and ``predict`` for each essay set.

    Raises:
        AssertionError: if the cost function resolves to ``None`` or the number
            of resolved extractors differs from the number of names.
    """
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    # The lookup helper works on lists of names: wrap the single cost-function
    # name and take the first result.
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure all extractors located
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    if stemmed:
        ngram_extractor = NgramExtractorStemmed(max_ngram_len=ngrams)
    else:
        ngram_extractor = NgramExtractor(max_ngram_len=ngrams)
    parse_model = SearnModelTemplateFeatures(feature_extractor=template_feature_extractor,
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor, cr_tags=cr_tags,
                                             base_learner_fact=BASE_LEARNER_FACT,
                                             beta=beta,
                                             # Model log messages are discarded; swap in a
                                             # printing callable to see them.
                                             log_fn=lambda s: None)

    # Only consult the global default when the caller did not choose a limit.
    epochs = MAX_EPOCHS if max_epochs is None else max_epochs
    parse_model.train(essays_TD, epochs)

    # Read the feature count only once training has run.
    num_feats = template_feature_extractor.num_features()

    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)

    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    return num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode
for i, (essays_TD, essays_VD) in enumerate(folds):
    print("\nCV % i" % i)

    # Alternative model configurations that used to sit here as commented-out
    # code (each was built from feat_extractor and cr_tags):
    #   - SearnModel, base_learner_fact=LogisticRegression, beta decay 0.1
    #   - SearnModelXgBoost, beta decay 0.3
    #   - SearnModelCla, base_learner_fact=CostSensitiveLogisticRegression, beta decay 0.3
    #   - SearnModelSklearnWeighted, beta decay 0.3, with one of:
    #       LogisticRegression
    #       lambda: RandomForestClassifier(n_jobs=1, max_depth=25)
    #       lambda: GradientBoostingClassifier(max_depth=3, max_features="log2"), sparse=False
    # "beta decay d" stands for beta_decay_fn=lambda beta: beta - d.

    # A new model is constructed for each fold; the extractors, cost function
    # and hyper-parameters come from the enclosing scope and are reused as-is.
    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=LogisticRegression,
        beta=BETA,
    )
    parse_model.train(essays_TD, MAX_EPOCHS)

    # Label data for both essay sets is collected before any prediction is made.
    sent_td_ys_bycode, sent_vd_ys_bycode = (
        parse_model.get_label_data(essays_TD),
        parse_model.get_label_data(essays_VD),
    )
    sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode = (
        parse_model.predict(essays_TD),
        parse_model.predict(essays_VD),
    )

    # Hand each per-fold result to merge_dictionaries together with its cv_*
    # collection (presumably the first argument is merged into the second).
    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
for i, (essays_TD, essays_VD) in enumerate(folds):
    print("\nCV % i" % i)

    # Model variants that were previously kept here as commented-out code
    # (all constructed from feat_extractor and cr_tags):
    #   * SearnModel with LogisticRegression, beta_decay_fn subtracting 0.1
    #   * SearnModelXgBoost, beta_decay_fn subtracting 0.3
    #   * SearnModelCla with CostSensitiveLogisticRegression, beta_decay_fn subtracting 0.3
    #   * SearnModelSklearnWeighted, beta_decay_fn subtracting 0.3, using either
    #     LogisticRegression, RandomForestClassifier(n_jobs=1, max_depth=25), or
    #     GradientBoostingClassifier(max_depth=3, max_features="log2") with sparse=False

    # Build and train a fresh model on this fold's first essay set.
    parse_model = SearnModelTemplateFeatures(feature_extractor=template_feature_extractor,
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor,
                                             cr_tags=cr_tags,
                                             base_learner_fact=LogisticRegression,
                                             beta=BETA)
    parse_model.train(essays_TD, MAX_EPOCHS)

    # Label data is gathered for both essay sets, then predictions for both.
    sent_td_ys_bycode, sent_vd_ys_bycode = \
        parse_model.get_label_data(essays_TD), parse_model.get_label_data(essays_VD)
    sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode = \
        parse_model.predict(essays_TD), parse_model.predict(essays_VD)

    # Pass every per-fold result to merge_dictionaries with the matching cv_*
    # collection (presumably merging the first argument into the second).
    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)