def seq_conv_train(config):
    xy_train = Step(name='xy_train',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input'],
                    adapter={'meta': ([('input', 'meta')]),
                             'train_mode': ([('input', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    xy_inference = Step(name='xy_inference',
                        transformer=XYSplit(**config.xy_splitter),
                        input_data=['input'],
                        adapter={'meta': ([('input', 'meta_valid')]),
                                 'train_mode': ([('input', 'train_mode')])
                                 },
                        cache_dirpath=config.env.cache_dirpath)

    loader_train = Step(name='loader',
                        transformer=MetadataImageSegmentationLoader(**config.loader),
                        input_data=['input'],
                        input_steps=[xy_train, xy_inference],
                        adapter={'X': ([('xy_train', 'X')], squeeze_inputs),
                                 'y': ([('xy_train', 'y')], squeeze_inputs),
                                 'train_mode': ([('input', 'train_mode')]),
                                 'X_valid': ([('xy_inference', 'X')], squeeze_inputs),
                                 'y_valid': ([('xy_inference', 'y')], squeeze_inputs),
                                 },
                        cache_dirpath=config.env.cache_dirpath)

    sequential_convnet = Step(name='sequential_convnet',
                              transformer=SequentialConvNet(**config.sequential_convnet),
                              input_steps=[loader_train],
                              cache_dirpath=config.env.cache_dirpath)

    mask_resize = Step(name='mask_resize',
                       transformer=Resizer(),
                       input_data=['input'],
                       input_steps=[sequential_convnet],
                       adapter={'images': ([('sequential_convnet', 'predicted_masks')]),
                                'target_sizes': ([('input', 'target_sizes')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)

    thresholding = Step(name='thresholding',
                        transformer=Thresholder(**config.thresholder),
                        input_steps=[mask_resize],
                        adapter={'images': ([('mask_resize', 'resized_images')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath)

    output = Step(name='output',
                  transformer=Dummy(),
                  input_steps=[thresholding],
                  adapter={'y_pred': ([('thresholding', 'binarized_images')]),
                           },
                  cache_dirpath=config.env.cache_dirpath)

    return output
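# Illustrative training-time usage sketch (not part of the original module). It assumes
# the final Step exposes ``fit_transform`` and that the raw data dict mirrors the keys
# referenced in the adapters above ('meta', 'meta_valid', 'train_mode', 'target_sizes').
# ``meta_train``, ``meta_valid`` and ``sizes`` are placeholders for the caller's data.
def _example_seq_conv_train_usage(config, meta_train, meta_valid, sizes):
    pipeline = seq_conv_train(config)
    data = {'input': {'meta': meta_train,
                      'meta_valid': meta_valid,
                      'train_mode': True,
                      'target_sizes': sizes}}
    output = pipeline.fit_transform(data)  # fits every step, then returns the final adapter outputs
    return output['y_pred']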
def inference_preprocessing(config):
    xy_train = Step(name='xy_train',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input'],
                    adapter={'meta': ([('input', 'meta')]),
                             'train_mode': ([('input', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    text_cleaner = Step(name='text_cleaner_train',
                        transformer=TextCleaner(**config.text_cleaner),
                        input_steps=[xy_train],
                        adapter={'X': ([('xy_train', 'X')])},
                        cache_dirpath=config.env.cache_dirpath)

    cleaning_output = Step(name='cleaning_output',
                           transformer=Dummy(),
                           input_data=['input'],
                           input_steps=[xy_train, text_cleaner],
                           adapter={'X': ([('text_cleaner_train', 'X')]),
                                    'y': ([('xy_train', 'y')]),
                                    'train_mode': ([('input', 'train_mode')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath)

    return cleaning_output
def inference_preprocessing(config):
    fill_na_x = Step(name='fill_na_x',
                     transformer=FillNA(**config.fill_na),
                     input_data=['input'],
                     adapter={'X': ([('input', 'meta')])},
                     cache_dirpath=config.env.cache_dirpath)

    xy_split = Step(name='xy_split',
                    transformer=XYSplit(**config.xy_split),
                    input_data=['input'],
                    input_steps=[fill_na_x],
                    adapter={'meta': ([('fill_na_x', 'X')]),
                             'train_mode': ([('input', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    return xy_split
def unet_inference(config):
    xy_inference = Step(name='xy_inference',
                        transformer=XYSplit(**config.xy_splitter),
                        input_data=['input'],
                        adapter={'meta': ([('input', 'meta')]),
                                 'train_mode': ([('input', 'train_mode')])
                                 },
                        cache_dirpath=config.env.cache_dirpath)

    loader_inference = Step(name='loader',
                            transformer=MetadataImageSegmentationLoader(**config.loader),
                            input_data=['input'],
                            input_steps=[xy_inference, xy_inference],
                            adapter={'X': ([('xy_inference', 'X')], squeeze_inputs),
                                     'y': ([('xy_inference', 'y')], squeeze_inputs),
                                     'train_mode': ([('input', 'train_mode')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)

    unet_network = Step(name='unet_network',
                        transformer=PyTorchUNet(**config.unet_network),
                        input_steps=[loader_inference],
                        cache_dirpath=config.env.cache_dirpath)

    mask_resize = Step(name='mask_resize',
                       transformer=Resizer(),
                       input_data=['input'],
                       input_steps=[unet_network],
                       adapter={'images': ([('unet_network', 'predicted_masks')]),
                                'target_sizes': ([('input', 'target_sizes')]),
                                },
                       cache_dirpath=config.env.cache_dirpath)

    thresholding = Step(name='thresholding',
                        transformer=Thresholder(**config.thresholder),
                        input_steps=[mask_resize],
                        adapter={'images': ([('mask_resize', 'resized_images')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath)

    output = Step(name='output',
                  transformer=Dummy(),
                  input_steps=[thresholding],
                  adapter={'y_pred': ([('thresholding', 'binarized_images')]),
                           },
                  cache_dirpath=config.env.cache_dirpath)

    return output
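# Illustrative inference-time usage sketch (not part of the original module). It assumes
# the final Step exposes ``transform`` and that the raw data dict mirrors the adapter
# keys above ('meta', 'train_mode', 'target_sizes'); ``meta_test`` and ``sizes`` are
# placeholders for the caller's data.
def _example_unet_inference_usage(config, meta_test, sizes):
    pipeline = unet_inference(config)
    data = {'input': {'meta': meta_test,
                      'train_mode': False,
                      'target_sizes': sizes}}
    output = pipeline.transform(data)  # reuses transformers fitted during training
    return output['y_pred']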
def preprocessing_multitask_inference(config):
    if config.execution.load_in_memory:
        reader_inference = Step(name='reader_inference',
                                transformer=ImageReader(**config.reader_multitask),
                                input_data=['input'],
                                adapter={'meta': ([('input', 'meta')]),
                                         'train_mode': ([('input', 'train_mode')]),
                                         },
                                cache_dirpath=config.env.cache_dirpath)

        loader = Step(name='loader',
                      transformer=ImageSegmentationMultitaskLoader(**config.loader),
                      input_data=['input'],
                      input_steps=[reader_inference],
                      adapter={'X': ([('reader_inference', 'X')]),
                               'y': ([('reader_inference', 'y')]),
                               'train_mode': ([('input', 'train_mode')]),
                               },
                      cache_dirpath=config.env.cache_dirpath)
    else:
        xy_inference = Step(name='xy_inference',
                            transformer=XYSplit(**config.xy_splitter),
                            input_data=['input'],
                            adapter={'meta': ([('input', 'meta')]),
                                     'train_mode': ([('input', 'train_mode')])
                                     },
                            cache_dirpath=config.env.cache_dirpath)

        loader = Step(name='loader',
                      transformer=MetadataImageSegmentationMultitaskLoader(**config.loader),
                      input_data=['input'],
                      input_steps=[xy_inference, xy_inference],
                      adapter={'X': ([('xy_inference', 'X')], squeeze_inputs),
                               'y': ([('xy_inference', 'y')], squeeze_inputs),
                               'train_mode': ([('input', 'train_mode')]),
                               },
                      cache_dirpath=config.env.cache_dirpath)

    return loader
def count_features(config):
    xy_split = Step(name='xy_split',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input'],
                    adapter={'meta': ([('input', 'meta')]),
                             'train_mode': ([('input', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    text_counter = Step(name='text_counter',
                        transformer=TextCounter(),
                        input_steps=[xy_split],
                        adapter={'X': ([('xy_split', 'X')])},
                        cache_dirpath=config.env.cache_dirpath)

    normalizer = Step(name='normalizer',
                      transformer=Normalizer(),
                      input_steps=[text_counter],
                      adapter={'X': ([('text_counter', 'X')])},
                      cache_dirpath=config.env.cache_dirpath)

    return normalizer
def _preprocessing(config, is_train=True):
    if is_train:
        xy_train = Step(name='xy_train',
                        transformer=XYSplit(**config.xy_splitter),
                        input_data=['input'],
                        adapter={'meta': ([('input', 'meta')]),
                                 'train_mode': ([('input', 'train_mode')])
                                 },
                        cache_dirpath=config.env.cache_dirpath)

        text_cleaner_train = Step(name='text_cleaner_train',
                                  transformer=TextCleaner(**config.text_cleaner),
                                  input_steps=[xy_train],
                                  adapter={'X': ([('xy_train', 'X')])},
                                  cache_dirpath=config.env.cache_dirpath)

        xy_valid = Step(name='xy_valid',
                        transformer=XYSplit(**config.xy_splitter),
                        input_data=['input'],
                        adapter={'meta': ([('input', 'meta_valid')]),
                                 'train_mode': ([('input', 'train_mode')])
                                 },
                        cache_dirpath=config.env.cache_dirpath)

        text_cleaner_valid = Step(name='text_cleaner_valid',
                                  transformer=TextCleaner(**config.text_cleaner),
                                  input_steps=[xy_valid],
                                  adapter={'X': ([('xy_valid', 'X')])},
                                  cache_dirpath=config.env.cache_dirpath)

        cleaning_output = Step(name='cleaning_output',
                               transformer=Dummy(),
                               input_data=['input'],
                               input_steps=[xy_train, text_cleaner_train,
                                            xy_valid, text_cleaner_valid],
                               adapter={'X': ([('text_cleaner_train', 'X')]),
                                        'y': ([('xy_train', 'y')]),
                                        'train_mode': ([('input', 'train_mode')]),
                                        'X_valid': ([('text_cleaner_valid', 'X')]),
                                        'y_valid': ([('xy_valid', 'y')]),
                                        },
                               cache_dirpath=config.env.cache_dirpath)
    else:
        xy_train = Step(name='xy_train',
                        transformer=XYSplit(**config.xy_splitter),
                        input_data=['input'],
                        adapter={'meta': ([('input', 'meta')]),
                                 'train_mode': ([('input', 'train_mode')])
                                 },
                        cache_dirpath=config.env.cache_dirpath)

        text_cleaner = Step(name='text_cleaner_train',
                            transformer=TextCleaner(**config.text_cleaner),
                            input_steps=[xy_train],
                            adapter={'X': ([('xy_train', 'X')])},
                            cache_dirpath=config.env.cache_dirpath)

        cleaning_output = Step(name='cleaning_output',
                               transformer=Dummy(),
                               input_data=['input'],
                               input_steps=[xy_train, text_cleaner],
                               adapter={'X': ([('text_cleaner_train', 'X')]),
                                        'y': ([('xy_train', 'y')]),
                                        'train_mode': ([('input', 'train_mode')]),
                                        },
                               cache_dirpath=config.env.cache_dirpath)

    return cleaning_output
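# Hedged sketch (an assumption, not taken from the original module): _preprocessing is
# intended to back thin train/inference builders such as the ones below. Note that both
# branches name the cleaning step 'text_cleaner_train'; assuming Step persists fitted
# transformers under cache_dirpath keyed by step name, the inference branch then reuses
# the cleaner fitted during training. The wrapper names here are illustrative only.
def _example_preprocessing_train(config):
    return _preprocessing(config, is_train=True)


def _example_preprocessing_inference(config):
    return _preprocessing(config, is_train=False)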
def ensemble_extraction(config):
    xy_train = Step(name='xy_train',
                    transformer=XYSplit(**config.xy_splitter),
                    input_data=['input_ensemble'],
                    adapter={'meta': ([('input_ensemble', 'meta')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    text_cleaner_train = Step(name='text_cleaner_train',
                              transformer=TextCleaner(**config.text_cleaner),
                              input_steps=[xy_train],
                              adapter={'X': ([('xy_train', 'X')])},
                              cache_dirpath=config.env.cache_dirpath)

    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[text_cleaner_train],
                          input_data=['input_ensemble'],
                          adapter={'X': ([('text_cleaner_train', 'X')]),
                                   'train_mode': ([('input_ensemble', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[text_cleaner_train],
                                 adapter={'text': ([('text_cleaner_train', 'X')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    bad_word_filter = Step(name='bad_word_filter',
                           transformer=WordListFilter(**config.bad_word_filter),
                           input_steps=[text_cleaner_train],
                           adapter={'X': ([('text_cleaner_train', 'X')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath)

    bad_word_tfidf_word_vectorizer = Step(name='bad_word_tfidf_word_vectorizer',
                                          transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                          input_steps=[bad_word_filter],
                                          adapter={'text': ([('bad_word_filter', 'X')]),
                                                   },
                                          cache_dirpath=config.env.cache_dirpath)

    text_counter = Step(name='text_counter',
                        transformer=TextCounter(),
                        input_steps=[xy_train],
                        adapter={'X': ([('xy_train', 'X')])},
                        cache_dirpath=config.env.cache_dirpath)

    normalizer = Step(name='normalizer',
                      transformer=Normalizer(),
                      input_steps=[text_counter],
                      adapter={'X': ([('text_counter', 'X')])},
                      cache_dirpath=config.env.cache_dirpath)

    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)

    logreg_count = Step(name='logreg_count',
                        transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                        input_steps=[xy_train, normalizer],
                        adapter={'X': ([('normalizer', 'X')]),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)

    logreg_bad_word = Step(name='logreg_bad_word',
                           transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                           input_steps=[xy_train, bad_word_tfidf_word_vectorizer],
                           adapter={'X': ([('bad_word_tfidf_word_vectorizer', 'features')]),
                                    'y': ([('xy_train', 'y')]),
                                    },
                           cache_dirpath=config.env.cache_dirpath,
                           cache_output=True)

    logreg_bad_word_count = Step(name='logreg_bad_word_count',
                                 transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                                 input_steps=[xy_train, normalizer, bad_word_tfidf_word_vectorizer],
                                 adapter={'X': ([('normalizer', 'X'),
                                                 ('bad_word_tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                          'y': ([('xy_train', 'y')]),
                                          },
                                 cache_dirpath=config.env.cache_dirpath,
                                 cache_output=True)

    logreg_tfidf = Step(name='logreg_tfidf',
                        transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                        input_steps=[xy_train, tfidf_char_vectorizer, tfidf_word_vectorizer],
                        adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                        ('tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                 'y': ([('xy_train', 'y')]),
                                 },
                        cache_dirpath=config.env.cache_dirpath,
                        cache_output=True)

    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_train],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_train],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_train', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)

    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_train, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_train', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_train, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_train', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)

    return [logreg_count, logreg_bad_word, logreg_bad_word_count, logreg_tfidf,
            char_vdcnn, word_lstm, glove_lstm, glove_scnn, glove_dpcnn]
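# Hedged sketch (not from the original module): the steps returned by ensemble_extraction
# are meant to feed a second-level model. Assuming each model step exposes a
# 'prediction_probability' output and that an hstack-style adapter helper is available
# (both are assumptions here), the feature-stacking step could look roughly like this.
def _example_ensemble_features(config, hstack_inputs):
    extraction_steps = ensemble_extraction(config)
    return Step(name='ensemble_features',
                transformer=Dummy(),
                input_steps=extraction_steps,
                adapter={'X': ([(step.name, 'prediction_probability')
                                for step in extraction_steps], hstack_inputs)},
                cache_dirpath=config.env.cache_dirpath)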
def ensemble_extraction(config):
    fill_na_x = Step(name='fill_na_x',
                     transformer=FillNA(**config.fill_na),
                     input_data=['input_ensemble'],
                     adapter={'X': ([('input_ensemble', 'meta')])},
                     cache_dirpath=config.env.cache_dirpath)

    xy_split = Step(name='xy_split',
                    transformer=XYSplit(**config.xy_split),
                    input_data=['input_ensemble'],
                    input_steps=[fill_na_x],
                    adapter={'meta': ([('fill_na_x', 'X')]),
                             'train_mode': ([('input_ensemble', 'train_mode')])
                             },
                    cache_dirpath=config.env.cache_dirpath)

    char_tokenizer = Step(name='char_tokenizer',
                          transformer=Tokenizer(**config.char_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    word_tokenizer = Step(name='word_tokenizer',
                          transformer=Tokenizer(**config.word_tokenizer),
                          input_steps=[xy_split],
                          adapter={'X': ([('xy_split', 'X')], fetch_x_train),
                                   'train_mode': ([('xy_split', 'train_mode')])
                                   },
                          cache_dirpath=config.env.cache_dirpath)

    tfidf_char_vectorizer = Step(name='tfidf_char_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    tfidf_word_vectorizer = Step(name='tfidf_word_vectorizer',
                                 transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
                                 input_steps=[xy_split],
                                 adapter={'text': ([('xy_split', 'X')], fetch_x_train),
                                          },
                                 cache_dirpath=config.env.cache_dirpath)

    glove_embeddings = Step(name='glove_embeddings',
                            transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
                            input_steps=[word_tokenizer],
                            adapter={'tokenizer': ([('word_tokenizer', 'tokenizer')]),
                                     },
                            cache_dirpath=config.env.cache_dirpath)

    log_reg_multi = Step(name='log_reg_multi',
                         transformer=LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
                         input_steps=[xy_split, tfidf_char_vectorizer, tfidf_word_vectorizer],
                         adapter={'X': ([('tfidf_char_vectorizer', 'features'),
                                         ('tfidf_word_vectorizer', 'features')], sparse_hstack_inputs),
                                  'y': ([('xy_split', 'y')]),
                                  },
                         cache_dirpath=config.env.cache_dirpath,
                         cache_output=True)

    char_vdcnn = Step(name='char_vdcnn',
                      transformer=CharVDCNN(**config.char_vdcnn_network),
                      input_steps=[char_tokenizer, xy_split],
                      adapter={'X': ([('char_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    word_lstm = Step(name='word_lstm',
                     transformer=WordLSTM(**config.word_lstm_network),
                     input_steps=[word_tokenizer, xy_split],
                     adapter={'X': ([('word_tokenizer', 'X')]),
                              'y': ([('xy_split', 'y')]),
                              },
                     cache_dirpath=config.env.cache_dirpath,
                     cache_output=True)

    glove_lstm = Step(name='glove_lstm',
                      transformer=GloveLSTM(**config.glove_lstm_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_scnn = Step(name='glove_scnn',
                      transformer=GloveSCNN(**config.glove_scnn_network),
                      input_steps=[word_tokenizer, xy_split, glove_embeddings],
                      adapter={'X': ([('word_tokenizer', 'X')]),
                               'y': ([('xy_split', 'y')]),
                               'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                               },
                      cache_dirpath=config.env.cache_dirpath,
                      cache_output=True)

    glove_dpcnn = Step(name='glove_dpcnn',
                       transformer=GloveDPCNN(**config.glove_dpcnn_network),
                       input_steps=[word_tokenizer, xy_split, glove_embeddings],
                       adapter={'X': ([('word_tokenizer', 'X')]),
                                'y': ([('xy_split', 'y')]),
                                'embedding_matrix': ([('glove_embeddings', 'embeddings_matrix')]),
                                },
                       cache_dirpath=config.env.cache_dirpath,
                       cache_output=True)

    return [log_reg_multi, char_vdcnn, word_lstm, glove_lstm, glove_scnn, glove_dpcnn]