def train_model_best(is_training=False, model_name='S2-gb', classifier=lr):
    """Assemble the "best" feature model, optionally train it, and evaluate it.

    Args:
        is_training: When true, ``train_nlpcc`` is called on the model before
            it is evaluated.
        model_name: Name handed to ``stst.Model``.
        classifier: Classifier handed to ``stst.Model``; defaults to the
            module-level ``lr``.

    Returns:
        The assembled ``stst.Model`` after ``dev_nlpcc`` has been run on it.
    """
    model_best = stst.Model(model_name, classifier)

    # Registration order is kept as: TF word, TF char, bigram word,
    # bigram char -- all with convey='count'.
    # NOTE(review): load=True presumably reuses previously extracted
    # features -- confirm against the feature classes.
    for feature_cls in (TFFeature, BigramFeature):
        for unit in ('word', 'char'):
            model_best.add(feature_cls(type=unit, convey='count', load=True))

    # Only the headlines vectors are consumed here; the previously assigned
    # but never-read embedding.50/100/200/300 path locals were removed.
    headlines_vec = config.EMB_WORD_DIR + '/headlines.vec'
    for pooling in ('avg', 'all'):
        model_best.add(
            MinAvgMaxEmbeddingFeature(
                'headlines', 100, headlines_vec,
                pooling_type=pooling, load=True))

    if is_training:
        train_nlpcc(model_best)
    # NOTE(review): evaluation is assumed to run whether or not training was
    # requested -- confirm against the original layout of this function.
    dev_nlpcc(model_best)
    return model_best
def stack():
    """Train a stacked model on the ``*_stack`` sub-models, then evaluate it.

    A ``stst.Model`` named ``'Stack1'`` using the module-level ``boosting``
    classifier is trained with ``model_best_stack``, ``xgb_model_best_stack``
    and ``model_emb_stack`` registered as its features. Its feature list is
    then cleared and the corresponding ``model_best``, ``xgb_model_best`` and
    ``model_emb`` are registered before ``dev_nlpcc`` is called.

    Returns:
        None. All inputs are module-level globals.
    """
    model_stack = stst.Model('Stack1', boosting)

    # Training phase: sub-models registered as (best, xgb, emb).
    for sub_model in (model_best_stack, xgb_model_best_stack, model_emb_stack):
        model_stack.add(sub_model)
    train_nlpcc(model_stack)

    # Evaluation phase: replace the features with the non-stack counterparts.
    # They are registered in the SAME (best, xgb, emb) order used for
    # training; the previous (best, emb, xgb) order would feed the trained
    # classifier swapped inputs if features are concatenated in registration
    # order.
    # NOTE(review): presumably stst.Model concatenates features in
    # registration order -- confirm.
    model_stack.feature_list = []
    for sub_model in (model_best, xgb_model_best, model_emb):
        model_stack.add(sub_model)
    dev_nlpcc(model_stack)
# NOTE(review): the two accumulations below presumably belong to a
# per-instance loop whose header lies outside this excerpt -- confirm their
# indentation against the full file.
word_count += len(words)
char_count += len(chars)

# Report the accumulated word and character counts divided by the number of
# instances.
print(word_count / len(instances))
print(char_count / len(instances))

# Classifier wrappers available to the models defined in this script.
lr = stst.Classifier(stst.LIB_LINEAR_LR())
svm = stst.Classifier(stst.skLearn_svm())
xgb = stst.Classifier(stst.XGBOOST_prob())
boosting = stst.Classifier(stst.sklearn_GradientBoosting())

# 'S-lr-expand' model on the ``lr`` classifier; every feature is registered
# with load=False.
model = stst.Model('S-lr-expand', lr)
model.add(
    TFFeature(type='word', convey='count', load=False)
)
model.add(
    TFFeature(type='char', convey='count', load=False)
)
model.add(
    BigramFeature(type='word', convey='count', load=False)
)
model.add(
    BigramFeature(type='char', convey='count', load=False)
)

# Pooled embedding feature built from the headlines vectors.
headlines_vec = config.EMB_WORD_DIR + '/headlines.vec'
model.add(
    MinAvgMaxEmbeddingFeature(
        'headlines',
        100,
        headlines_vec,
        pooling_type='avg',
        load=False,
    )
)