def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta, max_epochs):
    """Train a SEARN template-feature parsing model and score it at the essay level.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        extractor_names: names of feature-extractor functions to resolve.
        cost_function_name: name of the (single) cost function to resolve.
        ngrams: maximum ngram length for the ngram extractor.
        stemmed: if True, use the stemmed ngram extractor.
        beta: SEARN beta (roll-in/roll-out mixing) parameter.
        max_epochs: number of training epochs.

    Returns:
        Tuple of (num_feats, essay_td_ys_bycode, essay_vd_ys_bycode,
        essay_td_pred_ys_bycode, essay_vd_pred_ys_bycode).
    """
    # Resolve the named extractor functions and the single cost function.
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure every requested extractor was located.
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    # Stemming, when requested, is applied inside the ngram extractor.
    ngram_cls = NgramExtractorStemmed if stemmed else NgramExtractor
    ngram_extractor = ngram_cls(max_ngram_len=ngrams)

    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=BASE_LEARNER_FACT,
        beta=beta,
        log_fn=lambda s: None)  # training log output is suppressed
    parse_model.train(essays_TD, max_epochs=max_epochs)

    num_feats = template_feature_extractor.num_features()

    # Essay-level gold labels and predictions for both splits, keyed by code.
    essay_td_ys_bycode = parse_model.get_label_data_essay_level(essays_TD)
    essay_vd_ys_bycode = parse_model.get_label_data_essay_level(essays_VD)
    essay_td_pred_ys_bycode = parse_model.predict_essay_level(essays_TD)
    essay_vd_pred_ys_bycode = parse_model.predict_essay_level(essays_VD)

    return num_feats, essay_td_ys_bycode, essay_vd_ys_bycode, essay_td_pred_ys_bycode, essay_vd_pred_ys_bycode
def model_train_predict(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, stemmed, beta):
    """Train a SEARN template-feature parsing model and score it at the sentence level.

    NOTE(review): this redefines `model_train_predict`; an essay-level variant
    with the same name exists elsewhere in this file — the later definition wins.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        extractor_names: names of feature-extractor functions to resolve.
        cost_function_name: name of the (single) cost function to resolve.
        ngrams: maximum ngram length for the ngram extractor.
        stemmed: if True, use the stemmed ngram extractor.
        beta: SEARN beta (roll-in/roll-out mixing) parameter.

    Returns:
        Tuple of (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
        sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode).
    """
    # Resolve the named extractor functions and the single cost function.
    extractors = get_functions_by_name(extractor_names, all_extractor_fns)
    cost_fn = get_functions_by_name([cost_function_name], all_cost_functions)[0]
    assert cost_fn is not None, "Cost function look up failed"
    # Ensure every requested extractor was located.
    assert len(extractors) == len(extractor_names), "number of extractor functions does not match the number of names"

    template_feature_extractor = NonLocalTemplateFeatureExtractor(extractors=extractors)
    # Stemming, when requested, is applied inside the ngram extractor.
    ngram_cls = NgramExtractorStemmed if stemmed else NgramExtractor
    ngram_extractor = ngram_cls(max_ngram_len=ngrams)

    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=BASE_LEARNER_FACT,
        beta=beta,
        log_fn=lambda s: None)  # training log output is suppressed
    parse_model.train(essays_TD, MAX_EPOCHS)  # epoch count comes from the module-level constant

    num_feats = template_feature_extractor.num_features()

    # Sentence-level gold labels and predictions for both splits, keyed by code.
    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)
    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    return num_feats, sent_td_ys_bycode, sent_vd_ys_bycode, sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode
# Cross-validation: train one model per fold and accumulate sentence-level
# gold labels and predictions into the CV-wide dictionaries.
# (Names assigned in this loop body remain at script scope afterwards.)
for i, (essays_TD, essays_VD) in enumerate(folds):
    print("\nCV % i" % i)

    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=LogisticRegression,
        beta=BETA)
    parse_model.train(essays_TD, MAX_EPOCHS)

    # Per-fold gold labels and predictions, keyed by code.
    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)
    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    # Merge this fold's results into the running cross-validation totals.
    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
# Cross-validation loop: for each train/validation fold, fit a fresh
# SEARN model and fold its sentence-level labels/predictions into the
# aggregate CV dictionaries. Loop-body names persist after the loop.
for i, (essays_TD, essays_VD) in enumerate(folds):
    print("\nCV % i" % i)

    parse_model = SearnModelTemplateFeatures(
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=LogisticRegression,
        beta=BETA)
    parse_model.train(essays_TD, MAX_EPOCHS)

    # Gold labels and model predictions for this fold, keyed by code.
    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)
    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    # Accumulate into the cross-fold aggregates.
    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)