def char_vdcnn_train(config):
    """Build the training pipeline for the character-level VDCNN model.

    The chain is ``train_preprocessing(config)`` -> ``char_tokenizer``
    (Tokenizer) -> ``char_vdcnn`` (CharVDCNN) -> ``char_output`` (Dummy).
    Adapters read from a step named ``'cleaning_output'`` -- presumably the
    step returned by ``train_preprocessing``; confirm against that helper.

    Args:
        config: Configuration object. This function reads
            ``config.char_tokenizer``, ``config.char_vdcnn_network`` and
            ``config.env.cache_dirpath``.

    Returns:
        The ``char_output`` Step, whose ``y_pred`` input is wired to the
        ``prediction_probability`` output of the ``char_vdcnn`` step.
    """
    cleaned = train_preprocessing(config)
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='char_tokenizer',
        transformer=Tokenizer(**config.char_tokenizer),
        input_steps=[cleaned],
        adapter={
            'X': [('cleaning_output', 'X')],
            'X_valid': [('cleaning_output', 'X_valid')],
            'train_mode': [('cleaning_output', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # Validation features come from the tokenizer and validation labels from
    # the cleaning step; to_tuple_inputs is applied to that pair (presumably
    # packing them into a single tuple -- confirm in its definition).
    validation_sources = [
        ('char_tokenizer', 'X_valid'),
        ('cleaning_output', 'y_valid'),
    ]
    vdcnn_step = Step(
        name='char_vdcnn',
        transformer=CharVDCNN(**config.char_vdcnn_network),
        overwrite_transformer=True,
        input_steps=[tokenizer_step, cleaned],
        adapter={
            'X': [('char_tokenizer', 'X')],
            'y': [('cleaning_output', 'y')],
            'validation_data': (validation_sources, to_tuple_inputs),
        },
        cache_dirpath=cache_dir,
    )

    # Final step only re-exposes the network's probabilities under 'y_pred'.
    return Step(
        name='char_output',
        transformer=Dummy(),
        input_steps=[vdcnn_step],
        adapter={'y_pred': [('char_vdcnn', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
def word_lstm_train(config):
    """Build the training pipeline for the word-level LSTM model.

    The chain is ``train_preprocessing(config)`` -> ``word_tokenizer``
    (Tokenizer) -> ``word_lstm`` (WordLSTM) -> ``word_output`` (Dummy).
    Adapters read from a step named ``'cleaning_output'`` -- presumably the
    step returned by ``train_preprocessing``; confirm against that helper.

    Args:
        config: Configuration object. This function reads
            ``config.word_tokenizer``, ``config.word_lstm_network`` and
            ``config.env.cache_dirpath``.

    Returns:
        The ``word_output`` Step, whose ``y_pred`` input is wired to the
        ``prediction_probability`` output of the ``word_lstm`` step.
    """
    cleaned = train_preprocessing(config)
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[cleaned],
        adapter={
            'X': [('cleaning_output', 'X')],
            'X_valid': [('cleaning_output', 'X_valid')],
            'train_mode': [('cleaning_output', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # Validation features come from the tokenizer and validation labels from
    # the cleaning step; to_tuple_inputs is applied to that pair (presumably
    # packing them into a single tuple -- confirm in its definition).
    validation_sources = [
        ('word_tokenizer', 'X_valid'),
        ('cleaning_output', 'y_valid'),
    ]
    lstm_step = Step(
        name='word_lstm',
        transformer=WordLSTM(**config.word_lstm_network),
        overwrite_transformer=True,
        input_steps=[tokenizer_step, cleaned],
        adapter={
            'X': [('word_tokenizer', 'X')],
            'y': [('cleaning_output', 'y')],
            'validation_data': (validation_sources, to_tuple_inputs),
        },
        cache_dirpath=cache_dir,
    )

    # Final step only re-exposes the network's probabilities under 'y_pred'.
    return Step(
        name='word_output',
        transformer=Dummy(),
        input_steps=[lstm_step],
        adapter={'y_pred': [('word_lstm', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemple #3
0
def word_lstm_inference(config):
    """Build the inference pipeline for the word-level LSTM model.

    The chain is ``inference_preprocessing(config)`` -> ``word_tokenizer``
    (Tokenizer) -> ``word_lstm`` (WordLSTM) -> ``word_output`` (Dummy).
    No validation inputs are wired and ``overwrite_transformer`` is not
    passed to the network step.

    Args:
        config: Configuration object. This function reads
            ``config.word_tokenizer``, ``config.word_lstm_network`` and
            ``config.env.cache_dirpath``.

    Returns:
        The ``word_output`` Step, whose ``y_pred`` input is wired to the
        ``prediction_probability`` output of the ``word_lstm`` step.
    """
    cleaned = inference_preprocessing(config)
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[cleaned],
        adapter={
            'X': [('cleaning_output', 'X')],
            'train_mode': [('cleaning_output', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # NOTE(review): 'y' is still requested from 'cleaning_output' here even
    # though this is the inference graph -- kept exactly as in the original.
    lstm_step = Step(
        name='word_lstm',
        transformer=WordLSTM(**config.word_lstm_network),
        input_steps=[tokenizer_step, cleaned],
        adapter={
            'X': [('word_tokenizer', 'X')],
            'y': [('cleaning_output', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    return Step(
        name='word_output',
        transformer=Dummy(),
        input_steps=[lstm_step],
        adapter={'y_pred': [('word_lstm', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemple #4
0
def char_vdcnn_inference(config):
    """Build the inference pipeline for the character-level VDCNN model.

    The chain is ``inference_preprocessing(config)`` -> ``char_tokenizer``
    (Tokenizer) -> ``char_vdcnn`` (CharVDCNN) -> ``char_output`` (Dummy).
    No validation inputs are wired and ``overwrite_transformer`` is not
    passed to the network step.

    Args:
        config: Configuration object. This function reads
            ``config.char_tokenizer``, ``config.char_vdcnn_network`` and
            ``config.env.cache_dirpath``.

    Returns:
        The ``char_output`` Step, whose ``y_pred`` input is wired to the
        ``prediction_probability`` output of the ``char_vdcnn`` step.
    """
    cleaned = inference_preprocessing(config)
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='char_tokenizer',
        transformer=Tokenizer(**config.char_tokenizer),
        input_steps=[cleaned],
        adapter={
            'X': [('cleaning_output', 'X')],
            'train_mode': [('cleaning_output', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # NOTE(review): 'y' is still requested from 'cleaning_output' here even
    # though this is the inference graph -- kept exactly as in the original.
    vdcnn_step = Step(
        name='char_vdcnn',
        transformer=CharVDCNN(**config.char_vdcnn_network),
        input_steps=[tokenizer_step, cleaned],
        adapter={
            'X': [('char_tokenizer', 'X')],
            'y': [('cleaning_output', 'y')],
        },
        cache_dirpath=cache_dir,
    )

    return Step(
        name='char_output',
        transformer=Dummy(),
        input_steps=[vdcnn_step],
        adapter={'y_pred': [('char_vdcnn', 'prediction_probability')]},
        cache_dirpath=cache_dir,
    )
Exemple #5
0
def _word_tokenizer(preprocessed_input, config, is_train=True):
    """Create the ``word_tokenizer`` Step for training or inference.

    Args:
        preprocessed_input: Upstream Step used as the only input step. The
            adapter refers to it by the name ``'cleaning_output'``
            (presumably that step's name -- confirm at the call site).
        config: Configuration object. Reads ``config.word_tokenizer`` and
            ``config.env.cache_dirpath``.
        is_train: When truthy, the adapter additionally maps ``X_valid``
            from the cleaning step; otherwise only ``X`` and ``train_mode``
            are wired.

    Returns:
        The configured ``word_tokenizer`` Step.
    """
    # Inputs shared by both modes; key order matches the original literals.
    wiring = {
        'X': [('cleaning_output', 'X')],
        'train_mode': [('cleaning_output', 'train_mode')],
    }
    if is_train:
        # The validation split is only wired while training.
        wiring['X_valid'] = [('cleaning_output', 'X_valid')]

    return Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[preprocessed_input],
        adapter=wiring,
        cache_dirpath=config.env.cache_dirpath,
    )
def glove_preprocessing_inference(config, preprocessed_input):
    """Create the tokenizer and GloVe-embedding steps for inference.

    Args:
        config: Configuration object. Reads ``config.word_tokenizer``,
            ``config.glove_embeddings`` and ``config.env.cache_dirpath``.
        preprocessed_input: Upstream Step feeding the tokenizer; the adapter
            refers to it as ``'cleaning_output'`` (presumably its name --
            confirm at the call site).

    Returns:
        tuple: ``(word_tokenizer_step, glove_embeddings_step)``.
    """
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[preprocessed_input],
        adapter={
            'X': [('cleaning_output', 'X')],
            'train_mode': [('cleaning_output', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # The embedding-matrix step consumes the tokenizer step's 'tokenizer'
    # output.
    embeddings_step = Step(
        name='glove_embeddings',
        transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[tokenizer_step],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]},
        cache_dirpath=cache_dir,
    )

    return tokenizer_step, embeddings_step
def glove_preprocessing_train(config, preprocessed_input):
    """Create the tokenizer and GloVe-embedding steps for training.

    Unlike the other tokenizer builders in this file, the adapter here reads
    from a step named ``'xy_split'`` and passes the values through
    ``fetch_x_train`` / ``fetch_x_valid``.

    Args:
        config: Configuration object. Reads ``config.word_tokenizer``,
            ``config.glove_embeddings`` and ``config.env.cache_dirpath``.
        preprocessed_input: Upstream Step feeding the tokenizer (presumably
            the step named ``'xy_split'`` -- confirm at the call site).

    Returns:
        tuple: ``(word_tokenizer_step, glove_embeddings_step)``.
    """
    cache_dir = config.env.cache_dirpath

    tokenizer_step = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[preprocessed_input],
        adapter={
            'X': ([('xy_split', 'X')], fetch_x_train),
            'X_valid': ([('xy_split', 'validation_data')], fetch_x_valid),
            'train_mode': [('xy_split', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # The embedding-matrix step consumes the tokenizer step's 'tokenizer'
    # output.
    embeddings_step = Step(
        name='glove_embeddings',
        transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[tokenizer_step],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]},
        cache_dirpath=cache_dir,
    )

    return tokenizer_step, embeddings_step
def ensemble_extraction(config):
    """Build the nine model steps returned for ensemble extraction.

    Feature steps (split, text cleaning, tokenizers, TF-IDF vectorizers,
    bad-word filter, text counter + normalizer, GloVe embeddings) are created
    first; the model steps are then wired on top of them. Every model step
    takes its labels from ``('xy_train', 'y')`` and is created with
    ``cache_output=True``.

    NOTE(review): a second function with the same name is defined later in
    this file; if both live in one module the later definition wins.

    Args:
        config: Configuration object. Reads ``config.env.cache_dirpath`` and
            one attribute per transformer (``xy_splitter``, ``text_cleaner``,
            ``char_tokenizer``, ``word_tokenizer``, ``tfidf_char_vectorizer``,
            ``tfidf_word_vectorizer``, ``bad_word_filter``,
            ``glove_embeddings``, ``logistic_regression_multilabel``,
            ``char_vdcnn_network``, ``word_lstm_network``,
            ``glove_lstm_network``, ``glove_scnn_network``,
            ``glove_dpcnn_network``).

    Returns:
        list: ``[logreg_count, logreg_bad_word, logreg_bad_word_count,
        logreg_tfidf, char_vdcnn, word_lstm, glove_lstm, glove_scnn,
        glove_dpcnn]`` Steps, in that order.
    """
    cache_dir = config.env.cache_dirpath

    def _model_step(name, transformer, input_steps, x_source, **extra_inputs):
        # Shared shape of every model step: features under 'X', labels from
        # the split step under 'y', optional extra adapter entries appended
        # after them, and output caching switched on.
        wiring = {'X': x_source, 'y': [('xy_train', 'y')]}
        wiring.update(extra_inputs)
        return Step(
            name=name,
            transformer=transformer,
            input_steps=input_steps,
            adapter=wiring,
            cache_dirpath=cache_dir,
            cache_output=True,
        )

    # ---- data split and text cleaning ------------------------------------
    splitter = Step(
        name='xy_train',
        transformer=XYSplit(**config.xy_splitter),
        input_data=['input_ensemble'],
        adapter={
            'meta': [('input_ensemble', 'meta')],
            'train_mode': [('input_ensemble', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )
    cleaner = Step(
        name='text_cleaner_train',
        transformer=TextCleaner(**config.text_cleaner),
        input_steps=[splitter],
        adapter={'X': [('xy_train', 'X')]},
        cache_dirpath=cache_dir,
    )

    # ---- tokenizers (train_mode is read straight from the raw input) -----
    char_tok = Step(
        name='char_tokenizer',
        transformer=Tokenizer(**config.char_tokenizer),
        input_steps=[cleaner],
        input_data=['input_ensemble'],
        adapter={
            'X': [('text_cleaner_train', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )
    word_tok = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[cleaner],
        input_data=['input_ensemble'],
        adapter={
            'X': [('text_cleaner_train', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # ---- TF-IDF features on the cleaned text -----------------------------
    tfidf_char = Step(
        name='tfidf_char_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
        input_steps=[cleaner],
        adapter={'text': [('text_cleaner_train', 'X')]},
        cache_dirpath=cache_dir,
    )
    tfidf_word = Step(
        name='tfidf_word_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[cleaner],
        adapter={'text': [('text_cleaner_train', 'X')]},
        cache_dirpath=cache_dir,
    )

    # ---- bad-word features (filter, then word-level TF-IDF) --------------
    bad_words = Step(
        name='bad_word_filter',
        transformer=WordListFilter(**config.bad_word_filter),
        input_steps=[cleaner],
        adapter={'X': [('text_cleaner_train', 'X')]},
        cache_dirpath=cache_dir,
    )
    bad_words_tfidf = Step(
        name='bad_word_tfidf_word_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[bad_words],
        adapter={'text': [('bad_word_filter', 'X')]},
        cache_dirpath=cache_dir,
    )

    # ---- count features: computed on the un-cleaned split output ---------
    counter = Step(
        name='text_counter',
        transformer=TextCounter(),
        input_steps=[splitter],
        adapter={'X': [('xy_train', 'X')]},
        cache_dirpath=cache_dir,
    )
    normalized_counts = Step(
        name='normalizer',
        transformer=Normalizer(),
        input_steps=[counter],
        adapter={'X': [('text_counter', 'X')]},
        cache_dirpath=cache_dir,
    )

    # ---- GloVe embedding matrix from the word tokenizer ------------------
    glove = Step(
        name='glove_embeddings',
        transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[word_tok],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]},
        cache_dirpath=cache_dir,
    )

    # ---- logistic-regression models --------------------------------------
    logreg_count = _model_step(
        'logreg_count',
        LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
        [splitter, normalized_counts],
        [('normalizer', 'X')],
    )
    logreg_bad_word = _model_step(
        'logreg_bad_word',
        LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
        [splitter, bad_words_tfidf],
        [('bad_word_tfidf_word_vectorizer', 'features')],
    )
    # The two feature sources below are merged by sparse_hstack_inputs.
    logreg_bad_word_count = _model_step(
        'logreg_bad_word_count',
        LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
        [splitter, normalized_counts, bad_words_tfidf],
        ([('normalizer', 'X'),
          ('bad_word_tfidf_word_vectorizer', 'features')],
         sparse_hstack_inputs),
    )
    logreg_tfidf = _model_step(
        'logreg_tfidf',
        LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
        [splitter, tfidf_char, tfidf_word],
        ([('tfidf_char_vectorizer', 'features'),
          ('tfidf_word_vectorizer', 'features')],
         sparse_hstack_inputs),
    )

    # ---- neural models on tokenized text ---------------------------------
    char_vdcnn = _model_step(
        'char_vdcnn',
        CharVDCNN(**config.char_vdcnn_network),
        [char_tok, splitter],
        [('char_tokenizer', 'X')],
    )
    word_lstm = _model_step(
        'word_lstm',
        WordLSTM(**config.word_lstm_network),
        [word_tok, splitter],
        [('word_tokenizer', 'X')],
    )

    # ---- neural models that also receive the GloVe embedding matrix ------
    glove_lstm = _model_step(
        'glove_lstm',
        GloveLSTM(**config.glove_lstm_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )
    glove_scnn = _model_step(
        'glove_scnn',
        GloveSCNN(**config.glove_scnn_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )
    glove_dpcnn = _model_step(
        'glove_dpcnn',
        GloveDPCNN(**config.glove_dpcnn_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )

    return [
        logreg_count,
        logreg_bad_word,
        logreg_bad_word_count,
        logreg_tfidf,
        char_vdcnn,
        word_lstm,
        glove_lstm,
        glove_scnn,
        glove_dpcnn,
    ]
def ensemble_extraction(config):
    """Build the six model steps returned for ensemble extraction.

    The raw ``input_ensemble`` meta is passed through ``FillNA`` and then
    ``XYSplit``; tokenizers and TF-IDF vectorizers read ``X`` from the split
    step via ``fetch_x_train``. Every model step takes its labels from
    ``('xy_split', 'y')`` and is created with ``cache_output=True``.

    NOTE(review): another function with the same name is defined earlier in
    this file; if both live in one module this later definition wins.

    Args:
        config: Configuration object. Reads ``config.env.cache_dirpath`` and
            one attribute per transformer (``fill_na``, ``xy_split``,
            ``char_tokenizer``, ``word_tokenizer``, ``tfidf_char_vectorizer``,
            ``tfidf_word_vectorizer``, ``glove_embeddings``,
            ``logistic_regression_multilabel``, ``char_vdcnn_network``,
            ``word_lstm_network``, ``glove_lstm_network``,
            ``glove_scnn_network``, ``glove_dpcnn_network``).

    Returns:
        list: ``[log_reg_multi, char_vdcnn, word_lstm, glove_lstm,
        glove_scnn, glove_dpcnn]`` Steps, in that order.
    """
    cache_dir = config.env.cache_dirpath

    def _model_step(name, transformer, input_steps, x_source, **extra_inputs):
        # Shared shape of every model step: features under 'X', labels from
        # the split step under 'y', optional extra adapter entries appended
        # after them, and output caching switched on.
        wiring = {'X': x_source, 'y': [('xy_split', 'y')]}
        wiring.update(extra_inputs)
        return Step(
            name=name,
            transformer=transformer,
            input_steps=input_steps,
            adapter=wiring,
            cache_dirpath=cache_dir,
            cache_output=True,
        )

    # ---- NA filling, then X/y split --------------------------------------
    na_filler = Step(
        name='fill_na_x',
        transformer=FillNA(**config.fill_na),
        input_data=['input_ensemble'],
        adapter={'X': [('input_ensemble', 'meta')]},
        cache_dirpath=cache_dir,
    )
    splitter = Step(
        name='xy_split',
        transformer=XYSplit(**config.xy_split),
        input_data=['input_ensemble'],
        input_steps=[na_filler],
        adapter={
            'meta': [('fill_na_x', 'X')],
            'train_mode': [('input_ensemble', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # ---- tokenizers ------------------------------------------------------
    char_tok = Step(
        name='char_tokenizer',
        transformer=Tokenizer(**config.char_tokenizer),
        input_steps=[splitter],
        adapter={
            'X': ([('xy_split', 'X')], fetch_x_train),
            'train_mode': [('xy_split', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )
    word_tok = Step(
        name='word_tokenizer',
        transformer=Tokenizer(**config.word_tokenizer),
        input_steps=[splitter],
        adapter={
            'X': ([('xy_split', 'X')], fetch_x_train),
            'train_mode': [('xy_split', 'train_mode')],
        },
        cache_dirpath=cache_dir,
    )

    # ---- TF-IDF features -------------------------------------------------
    tfidf_char = Step(
        name='tfidf_char_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_char_vectorizer),
        input_steps=[splitter],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)},
        cache_dirpath=cache_dir,
    )
    tfidf_word = Step(
        name='tfidf_word_vectorizer',
        transformer=TfidfVectorizer(**config.tfidf_word_vectorizer),
        input_steps=[splitter],
        adapter={'text': ([('xy_split', 'X')], fetch_x_train)},
        cache_dirpath=cache_dir,
    )

    # ---- GloVe embedding matrix from the word tokenizer ------------------
    glove = Step(
        name='glove_embeddings',
        transformer=GloveEmbeddingsMatrix(**config.glove_embeddings),
        input_steps=[word_tok],
        adapter={'tokenizer': [('word_tokenizer', 'tokenizer')]},
        cache_dirpath=cache_dir,
    )

    # ---- logistic regression on both TF-IDF feature sets -----------------
    # The two feature sources are merged by sparse_hstack_inputs.
    log_reg_multi = _model_step(
        'log_reg_multi',
        LogisticRegressionMultilabel(**config.logistic_regression_multilabel),
        [splitter, tfidf_char, tfidf_word],
        ([('tfidf_char_vectorizer', 'features'),
          ('tfidf_word_vectorizer', 'features')],
         sparse_hstack_inputs),
    )

    # ---- neural models on tokenized text ---------------------------------
    char_vdcnn = _model_step(
        'char_vdcnn',
        CharVDCNN(**config.char_vdcnn_network),
        [char_tok, splitter],
        [('char_tokenizer', 'X')],
    )
    word_lstm = _model_step(
        'word_lstm',
        WordLSTM(**config.word_lstm_network),
        [word_tok, splitter],
        [('word_tokenizer', 'X')],
    )

    # ---- neural models that also receive the GloVe embedding matrix ------
    glove_lstm = _model_step(
        'glove_lstm',
        GloveLSTM(**config.glove_lstm_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )
    glove_scnn = _model_step(
        'glove_scnn',
        GloveSCNN(**config.glove_scnn_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )
    glove_dpcnn = _model_step(
        'glove_dpcnn',
        GloveDPCNN(**config.glove_dpcnn_network),
        [word_tok, splitter, glove],
        [('word_tokenizer', 'X')],
        embedding_matrix=[('glove_embeddings', 'embeddings_matrix')],
    )

    return [
        log_reg_multi,
        char_vdcnn,
        word_lstm,
        glove_lstm,
        glove_scnn,
        glove_dpcnn,
    ]