def tfidf_logreg(config):
    preprocessed_input = _preprocessing(config, is_train=False)
    tfidf_char_vectorizer, tfidf_word_vectorizer = _tfidf(preprocessed_input, config)
    tfidf_logreg = Step(name='tfidf_logreg',
                        transformer=LogisticRegressionMultilabel(
                            **config.logistic_regression_multilabel),
                        input_steps=[preprocessed_input,
                                     tfidf_char_vectorizer,
                                     tfidf_word_vectorizer],
                        adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                        ('tfidf_word_vectorizer', 'features')],
                                       sparse_hstack_inputs),
                                 'y': ([('cleaning_output', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath)
    output = Step(name='tfidf_logreg_output',
                  transformer=Dummy(),
                  input_steps=[tfidf_logreg],
                  adapter={'y_pred': ([('tfidf_logreg', 'prediction_probability')]),
                           },
                  cache_dirpath=config.env.cache_dirpath)
    return output

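# Note: `sparse_hstack_inputs` used by the adapters in this module is imported from
# elsewhere in the project and is not defined here. The function below is only an
# illustrative sketch of what such an adapter helper presumably does (column-wise
# stacking of sparse feature matrices); the underscored name is hypothetical and is
# not part of the project API.
from scipy import sparse


def _sparse_hstack_inputs_sketch(inputs):
    """Illustrative only: stack the feature matrices fetched by an adapter
    (e.g. the char- and word-level TF-IDF matrices above) into one CSR matrix."""
    return sparse.hstack(inputs).tocsr()
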
def bad_word_count_features_logreg(config):
    preprocessed_input = _preprocessing(config, is_train=False)
    normalizer = _count_features(config)
    xy_split = normalizer.get_step('xy_split')
    tfidf_word_vectorizer = _bad_word_tfidf(preprocessed_input, config)
    bad_word_count_logreg = Step(name='bad_word_count_logreg',
                                 transformer=LogisticRegressionMultilabel(
                                     **config.logistic_regression_multilabel),
                                 input_steps=[xy_split, normalizer, tfidf_word_vectorizer],
                                 adapter={'X': ([('normalizer', 'X'),
                                                 ('bad_word_tfidf_word_vectorizer', 'features')],
                                                sparse_hstack_inputs),
                                          'y': ([('xy_split', 'y')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    output = Step(name='bad_word_count_features_logreg_output',
                  transformer=Dummy(),
                  input_steps=[bad_word_count_logreg],
                  adapter={'y_pred': ([('bad_word_count_logreg', 'prediction_probability')]),
                           },
                  cache_dirpath=config.env.cache_dirpath)
    return output

def hand_crafted_all_logreg(config):
    xy_split, normalizer, char_vector, word_vector, bad_word_vector = hand_crafted_all(config)
    logreg_multi = Step(name='logreg_multi',
                        transformer=LogisticRegressionMultilabel(
                            **config.logistic_regression_multilabel),
                        input_steps=[xy_split, normalizer,
                                     char_vector, word_vector, bad_word_vector],
                        adapter={'X': ([('normalizer', 'X'),
                                        ('tfidf_char_vectorizer', 'features'),
                                        ('tfidf_word_vectorizer', 'features'),
                                        ('bad_word_tfidf_word_vectorizer', 'features')],
                                       sparse_hstack_inputs),
                                 'y': ([('xy_split', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath)
    logreg_output = Step(name='logreg_output',
                         transformer=Dummy(),
                         input_steps=[logreg_multi],
                         adapter={'y_pred': ([('logreg_multi', 'prediction_probability')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath)
    return logreg_output

def logistic_regression_ensemble_train(config):
    model_outputs = ensemble_extraction(config)
    output_mappings = [(output_step.name, 'prediction_probability')
                       for output_step in model_outputs]
    label = model_outputs[0].get_step('xy_train')
    input_steps = model_outputs + [label]
    logreg = Step(name='logreg_ensemble',
                  transformer=LogisticRegressionMultilabel(
                      **config.logistic_regression_ensemble),
                  overwrite_transformer=True,
                  input_steps=input_steps,
                  adapter={'X': (output_mappings, hstack_inputs),
                           'y': ([('xy_train', 'y')])
                           },
                  cache_dirpath=config.env.cache_dirpath)
    logreg_ensemble_output = Step(name='logreg_ensemble_output',
                                  transformer=Dummy(),
                                  input_steps=[logreg],
                                  adapter={'y_pred': ([('logreg_ensemble', 'prediction_probability')])},
                                  cache_dirpath=config.env.cache_dirpath)
    return logreg_ensemble_output

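# Note: `hstack_inputs` used by the ensemble step above is likewise imported from
# elsewhere in the project. Since the stacked inputs here are dense
# prediction-probability arrays from the first-level models, a plausible sketch
# (an assumption, with a hypothetical name) is a plain NumPy column stack:
import numpy as np


def _hstack_inputs_sketch(inputs):
    """Illustrative only: concatenate dense prediction arrays column-wise so the
    second-level logistic regression sees one feature block per first-level model."""
    return np.hstack(inputs)
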
def bad_word_logreg(config):
    preprocessed_input = inference_preprocessing(config)
    tfidf_word_vectorizer = bad_word_tfidf(preprocessed_input, config)
    logreg_bad_word = Step(name='logreg_bad_word',
                           transformer=LogisticRegressionMultilabel(
                               **config.logistic_regression_multilabel),
                           input_steps=[preprocessed_input, tfidf_word_vectorizer],
                           adapter={'X': ([('bad_word_tfidf_word_vectorizer', 'features')]),
                                    'y': ([('cleaning_output', 'y')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath)
    logreg_output = Step(name='logreg_output',
                         transformer=Dummy(),
                         input_steps=[logreg_bad_word],
                         adapter={'y_pred': ([('logreg_bad_word', 'prediction_probability')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath)
    return logreg_output

def count_features_logreg(config):
    normalizer = count_features(config)
    xy_split = normalizer.get_step('xy_split')
    logreg_count = Step(name='logreg_count',
                        transformer=LogisticRegressionMultilabel(
                            **config.logistic_regression_multilabel),
                        input_steps=[xy_split, normalizer],
                        adapter={'X': ([('normalizer', 'X')]),
                                 'y': ([('xy_split', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath)
    logreg_output = Step(name='logreg_output',
                         transformer=Dummy(),
                         input_steps=[logreg_count],
                         adapter={'y_pred': ([('logreg_count', 'prediction_probability')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath)
    return logreg_output

def tfidf_log_reg(preprocessed_input, config):
    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[preprocessed_input],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[preprocessed_input],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    log_reg_multi = Step(name='log_reg_multi',
                         transformer=LogisticRegressionMultilabel(
                             **config.logistic_regression_multilabel),
                         input_steps=[preprocessed_input,
                                      tfidf_char_vectorizer,
                                      tfidf_word_vectorizer],
                         adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                         ('tfidf_word_vectorizer', 'features')],
                                        sparse_hstack_inputs),
                                  'y': ([('xy_split', 'y')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath)
    log_reg_output = Step(name='log_reg_output',
                          transformer=Dummy(),
                          input_steps=[log_reg_multi],
                          adapter={'y_pred': ([('log_reg_multi', 'prediction_probability')]),
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    return log_reg_output

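# Note: `fetch_x_train` is another adapter helper defined outside this module. The
# adapters above pass it the single value fetched from ('xy_split', 'X'); a likely
# (but assumed) behaviour is that it unwraps that fetched list and, when the split
# yields both train and validation parts, keeps only the training texts. The sketch
# below is hypothetical and not the project's actual implementation.


def _fetch_x_train_sketch(inputs):
    """Illustrative only: unwrap the adapter's single fetched value and return the
    training portion when X is provided as a (train, valid) pair."""
    X = inputs[0]
    if isinstance(X, (list, tuple)) and len(X) == 2:
        return X[0]
    return X
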
def ensemble_extraction(config):
    xy_train = Step(name='xy_train',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input_ensemble'],
                    adapter={'meta': ([('input_ensemble', 'meta')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)
    text_cleaner_train = Step(name='text_cleaner_train',
                              transformer=TextCleaner(**config.text_cleaner),
                              input_steps=[xy_train],
                              adapter={'X': ([('xy_train', 'X')])},
                              cache_dirpath=config.env.cache_dirpath)
    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    bad_word_filter = Step(name='bad_word_filter',
                           transformer=WordListFilter(**config.bad_word_filter),
                           input_steps=[text_cleaner_train],
                           adapter={'X': ([('text_cleaner_train', 'X')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath)
    bad_word_tfidf_word_vectorizer = Step(name='bad_word_tfidf_word_vectorizer',
                                          transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                          input_steps=[bad_word_filter],
                                          adapter={'text': ([('bad_word_filter', 'X')]),
                                                   },
                                          cache_dirpath=config.env.cache_dirpath)
    text_counter = Step(name='text_counter',
                        transformer=TextCounter(),
                        input_steps=[xy_train],
                        adapter={'X': ([('xy_train', 'X')])},
                        cache_dirpath=config.env.cache_dirpath)
    normalizer = Step(name='normalizer',
                      transformer=Normalizer(),
                      input_steps=[text_counter],
                      adapter={'X': ([('text_counter', 'X')])},
                      cache_dirpath=config.env.cache_dirpath)
    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)
    logreg_count = Step(name='logreg_count',
                        transformer=LogisticRegressionMultilabel(
                            **config.logistic_regression_multilabel),
                        input_steps=[xy_train, normalizer],
                        adapter={'X': ([('normalizer', 'X')]),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)
    logreg_bad_word = Step(name='logreg_bad_word',
                           transformer=LogisticRegressionMultilabel(
                               **config.logistic_regression_multilabel),
                           input_steps=[xy_train, bad_word_tfidf_word_vectorizer],
                           adapter={'X': ([('bad_word_tfidf_word_vectorizer', 'features')]),
                                    'y': ([('xy_train', 'y')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath,
                           cache_output=True)
    logreg_bad_word_count = Step(name='logreg_bad_word_count',
                                 transformer=LogisticRegressionMultilabel(
                                     **config.logistic_regression_multilabel),
                                 input_steps=[xy_train, normalizer, bad_word_tfidf_word_vectorizer],
                                 adapter={'X': ([('normalizer', 'X'),
                                                 ('bad_word_tfidf_word_vectorizer', 'features')],
                                                sparse_hstack_inputs),
                                          'y': ([('xy_train', 'y')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath,
                                 cache_output=True)
    logreg_tfidf = Step(name='logreg_tfidf',
                        transformer=LogisticRegressionMultilabel(
                            **config.logistic_regression_multilabel),
                        input_steps=[xy_train, tfidf_char_vectorizer, tfidf_word_vectorizer],
                        adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                        ('tfidf_word_vectorizer', 'features')],
                                       sparse_hstack_inputs),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)
    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_train],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_train],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_train', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)
    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_train, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_train', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)
    return [logreg_count, logreg_bad_word, logreg_bad_word_count, logreg_tfidf,
            char_vdcnn, word_lstm, glove_lstm, glove_scnn, glove_dpcnn]

def ensemble_extraction(config):
    fill_na_x = Step(name='fill_na_x',
                     transformer=FillNA(**config.fill_na),
                     input_data=['input_ensemble'],
                     adapter={'X': ([('input_ensemble', 'meta')])},
                     cache_dirpath=config.env.cache_dirpath)
    xy_split = Step(name='xy_split',
                    transformer=XYSplit(**config.xy_split),
                    input_data=['input_ensemble'],
                    input_steps=[fill_na_x],
                    adapter={'meta': ([('fill_na_x', 'X')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)
    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)
    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)
    log_reg_multi = Step(name='log_reg_multi',
                         transformer=LogisticRegressionMultilabel(
                             **config.logistic_regression_multilabel),
                         input_steps=[xy_split, tfidf_char_vectorizer, tfidf_word_vectorizer],
                         adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                         ('tfidf_word_vectorizer', 'features')],
                                        sparse_hstack_inputs),
                                  'y': ([('xy_split', 'y')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath,
                         cache_output=True)
    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_split],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_split],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_split', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)
    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)
    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_split, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_split', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)
    return [log_reg_multi, char_vdcnn, word_lstm,
            glove_lstm, glove_scnn, glove_dpcnn]
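
# Usage sketch (an assumption about the surrounding Step API rather than code from
# this project): each factory in this module returns the terminal Step of a
# pipeline, and a steppy-style Step is typically run by calling `fit_transform`
# with a dict keyed by the `input_data` names used when the steps were defined
# (here 'input_ensemble'). For example, something along these lines:
#
#     first_level_steps = ensemble_extraction(config)
#     outputs = first_level_steps[-1].fit_transform(
#         {'input_ensemble': {'meta': meta_train, 'train_mode': True}})
#
# Running the final step resolves and caches all of its upstream dependencies.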