def char_vdcnn_train(config):
    preprocessed_input = train_preprocessing(config)
    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('cleaning_output', 'X')]),
                                   'X_valid': ([('cleaning_output', 'X_valid')]),
                                   'train_mode': ([('cleaning_output', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    network = Step(name='char_vdcnn',
                   transformer=CharVDCNN(**config.char_vdcnn_network),
                   overwrite_transformer=True,
                   input_steps=[char_tokenizer, preprocessed_input],
                   adapter={'X': ([('char_tokenizer', 'X')]),
                            'y': ([('cleaning_output', 'y')]),
                            'validation_data': ([('char_tokenizer', 'X_valid'),
                                                 ('cleaning_output', 'y_valid')], to_tuple_inputs),
                            },
                   cache_dirpath=config.env.cache_dirpath)
    char_output = Step(name='char_output',
                       transformer=Dummy(),
                       input_steps=[network],
                       adapter={'y_pred': ([('char_vdcnn', 'prediction_probability')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)
    return char_output


def word_lstm_train(config):
    preprocessed_input = train_preprocessing(config)
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('cleaning_output', 'X')]),
                                   'X_valid': ([('cleaning_output', 'X_valid')]),
                                   'train_mode': ([('cleaning_output', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     overwrite_transformer=True,
                     input_steps=[word_tokenizer, preprocessed_input],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('cleaning_output', 'y')]),
                              'validation_data': ([('word_tokenizer', 'X_valid'),
                                                   ('cleaning_output', 'y_valid')], to_tuple_inputs),
                              },
                     cache_dirpath=config.env.cache_dirpath)
    word_output = Step(name='word_output',
                       transformer=Dummy(),
                       input_steps=[word_lstm],
                       adapter={'y_pred': ([('word_lstm', 'prediction_probability')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)
    return word_output


def word_lstm_inference(config):
    preprocessed_input = inference_preprocessing(config)
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('cleaning_output', 'X')]),
                                   'train_mode': ([('cleaning_output', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, preprocessed_input],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('cleaning_output', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath)
    word_output = Step(name='word_output',
                       transformer=Dummy(),
                       input_steps=[word_lstm],
                       adapter={'y_pred': ([('word_lstm', 'prediction_probability')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)
    return word_output


def char_vdcnn_inference(config):
    preprocessed_input = inference_preprocessing(config)
    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('cleaning_output', 'X')]),
                                   'train_mode': ([('cleaning_output', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    network = Step(name='char_vdcnn',
                   transformer=CharVDCNN(**config.char_vdcnn_network),
                   input_steps=[char_tokenizer, preprocessed_input],
                   adapter={'X': ([('char_tokenizer', 'X')]),
                            'y': ([('cleaning_output', 'y')]),
                            },
                   cache_dirpath=config.env.cache_dirpath)
    char_output = Step(name='char_output',
                       transformer=Dummy(),
                       input_steps=[network],
                       adapter={'y_pred': ([('char_vdcnn', 'prediction_probability')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)
    return char_output


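# Usage sketch (an assumption, not part of the original module): the builders above return
# steps-style pipeline graphs, which are presumably run with `fit_transform` during training
# and `transform` at inference time. The input keys consumed by `train_preprocessing` /
# `inference_preprocessing` are not shown in this section, so the data dict is left abstract.
#
#     config = read_config()                        # hypothetical config loader
#     pipeline = word_lstm_train(config)
#     train_data = {...}                            # keyed by the input_data names the
#                                                   # preprocessing steps expect (not shown here)
#     outputs = pipeline.fit_transform(train_data)  # assumed Step API
#     probabilities = outputs['y_pred']             # 'y_pred' comes from the word_output adapter

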
def _word_tokenizer(preprocessed_input, config, is_train=True):
    if is_train:
        word_tokenizer = Step(name='word_tokenizer',
                              transformer=Tokenizer(**config.word_tokenizer),
                              input_steps=[preprocessed_input],
                              adapter={'X': ([('cleaning_output', 'X')]),
                                       'train_mode': ([('cleaning_output', 'train_mode')]),
                                       'X_valid': ([('cleaning_output', 'X_valid')])
                                       },
                              cache_dirpath=config.env.cache_dirpath)
    else:
        word_tokenizer = Step(name='word_tokenizer',
                              transformer=Tokenizer(**config.word_tokenizer),
                              input_steps=[preprocessed_input],
                              adapter={'X': ([('cleaning_output', 'X')]),
                                       'train_mode': ([('cleaning_output', 'train_mode')])
                                       },
                              cache_dirpath=config.env.cache_dirpath)
    return word_tokenizer


def glove_preprocessing_inference(config, preprocessed_input):
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('cleaning_output', 'X')]),
                                   'train_mode': ([('cleaning_output', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)
    return word_tokenizer, glove_embeddings


def glove_preprocessing_train(config, preprocessed_input):
    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[preprocessed_input],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'X_valid': ([('xy_split', 'validation_data')], fetch_x_valid),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)
    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)
    return word_tokenizer, glove_embeddings


def ensemble_extraction(config):
    """Builds the base-model steps whose predictions feed the ensemble: count-based,
    bad-word and TF-IDF logistic regressions plus the char/word/GloVe neural networks."""
    xy_train = Step(name='xy_train',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input_ensemble'],
                    adapter={'meta': ([('input_ensemble', 'meta')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    text_cleaner_train = Step(name='text_cleaner_train',
                              transformer=TextCleaner(**config.text_cleaner),
                              input_steps=[xy_train],
                              adapter={'X': ([('xy_train', 'X')])},
                              cache_dirpath=config.env.cache_dirpath)

    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    bad_word_filter = Step(name='bad_word_filter',
                           transformer=WordListFilter(**config.bad_word_filter),
                           input_steps=[text_cleaner_train],
                           adapter={'X': ([('text_cleaner_train', 'X')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath)

    bad_word_tfidf_word_vectorizer = Step(name='bad_word_tfidf_word_vectorizer',
                                          transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                          input_steps=[bad_word_filter],
                                          adapter={'text': ([('bad_word_filter', 'X')]),
                                                   },
                                          cache_dirpath=config.env.cache_dirpath)

    text_counter = Step(name='text_counter',
                        transformer=TextCounter(),
                        input_steps=[xy_train],
                        adapter={'X': ([('xy_train', 'X')])},
                        cache_dirpath=config.env.cache_dirpath)

    normalizer = Step(name='normalizer',
                      transformer=Normalizer(),
                      input_steps=[text_counter],
                      adapter={'X': ([('text_counter', 'X')])},
                      cache_dirpath=config.env.cache_dirpath)

    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)

    logreg_count = Step(name='logreg_count',
                        transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                        input_steps=[xy_train, normalizer],
                        adapter={'X': ([('normalizer', 'X')]),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)

    logreg_bad_word = Step(name='logreg_bad_word',
                           transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                           input_steps=[xy_train, bad_word_tfidf_word_vectorizer],
                           adapter={'X': ([('bad_word_tfidf_word_vectorizer', 'features')]),
                                    'y': ([('xy_train', 'y')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath,
                           cache_output=True)

    logreg_bad_word_count = Step(name='logreg_bad_word_count',
                                 transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                                 input_steps=[xy_train, normalizer, bad_word_tfidf_word_vectorizer],
                                 adapter={'X': ([('normalizer', 'X'),
                                                 ('bad_word_tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                          'y': ([('xy_train', 'y')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath,
                                 cache_output=True)

    logreg_tfidf = Step(name='logreg_tfidf',
                        transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                        input_steps=[xy_train, tfidf_char_vectorizer, tfidf_word_vectorizer],
                        adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                        ('tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)

    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_train],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_train],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_train', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)

    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_train, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_train', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)

    return [logreg_count, logreg_bad_word, logreg_bad_word_count, logreg_tfidf,
            char_vdcnn, word_lstm, glove_lstm, glove_scnn, glove_dpcnn]


def ensemble_extraction(config):
    """Alternative set of base-model steps built on FillNA + XYSplit preprocessing.

    NOTE: this redefines `ensemble_extraction` above; if both definitions live in the
    same module, this later one shadows the earlier one at import time.
    """
    fill_na_x = Step(name='fill_na_x',
                     transformer=FillNA(**config.fill_na),
                     input_data=['input_ensemble'],
                     adapter={'X': ([('input_ensemble', 'meta')])},
                     cache_dirpath=config.env.cache_dirpath)

    xy_split = Step(name='xy_split',
                    transformer=XYSplit(**config.xy_split),
                    input_data=['input_ensemble'],
                    input_steps=[fill_na_x],
                    adapter={'meta': ([('fill_na_x', 'X')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)

    log_reg_multi = Step(name='log_reg_multi',
                         transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                         input_steps=[xy_split, tfidf_char_vectorizer, tfidf_word_vectorizer],
                         adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                         ('tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                  'y': ([('xy_split', 'y')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath,
                         cache_output=True)

    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_split],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_split],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_split', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)

    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_split, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_split', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)

    return [log_reg_multi, char_vdcnn, word_lstm, glove_lstm, glove_scnn, glove_dpcnn]
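

# Usage sketch (an assumption, not part of the original module): `ensemble_extraction` returns
# a list of base-model steps. To build first-level predictions for stacking, each step would
# presumably be run over the ensemble split via `fit_transform` (assumed Step API). The
# 'input_ensemble' key and its 'meta'/'train_mode' fields mirror the adapters above; the
# 'prediction_probability' output name is assumed to match the one used by the earlier pipelines.
#
#     config = read_config()                                    # hypothetical config loader
#     base_models = ensemble_extraction(config)
#     data = {'input_ensemble': {'meta': ensemble_meta_df,      # hold-out split for stacking
#                                'train_mode': True}}
#     base_predictions = [step.fit_transform(data)['prediction_probability']
#                         for step in base_models]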