def am_glove_fasttext(data):
    """Run the GloVe-initialised fasttext-style experiment through the Job runner.

    Publishes a freshly built embedding matrix at module scope (the Keras
    build_fn reads it from there), then grid-searches the fixed
    hyper-parameters below over a text -> sequence -> pad -> model pipeline.
    """
    # Fixed hyper-parameters for this experiment.
    n_vocab = 40000
    seq_len = 700
    emb_dim = 300

    job = Job('am_glove_fasttext', cv = cv_n_fold_dl, n_threads = 1,
              model_package = 'keras')

    # build_fn=glove_fasttext presumably reads this module-level name — that is
    # why the matrix is published as a global rather than passed in.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind],
                                               max_features=n_vocab,
                                               embedding_dims=emb_dim)

    classifier = KerasClassifier(build_fn=glove_fasttext, batch_size=256,
                                 validation_split=0.1, nb_epoch=200, verbose=1)
    steps = [('txt_to_seq', TextToNumericSequence()),
             ('padd_seq', PadNumericSequence()),
             ('m', classifier)]
    grid = {'txt_to_seq__n_max_features': [n_vocab],
            'padd_seq__max_seq_len': [seq_len],
            'm__max_features': [n_vocab],
            'm__max_seq_len': [seq_len],
            'm__embedding_dims': [emb_dim]}
    job.run(Pipeline(steps=steps), grid, data)
    return None
# Example #2
 def pre_process_data_for_deep_learning(data, fs_text = 'text', verbose = 0):
     pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence(max_features)),
                                ('padd_seq', PadNumericSequence(max_seq_len))])
     m_pre_proc = pipeline.fit(data.df[fs_text].values)
     text_num = m_pre_proc.transform(data.df[fs_text].values)
     # text_num = np.array(text_num)
     return text_num, m_pre_proc
def ak_embedding_cnn_lstm(data):
    """Run the learned-embedding CNN+LSTM experiment through the Job runner."""
    job = Job('ak_embedding_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    model = KerasClassifier(build_fn=cnn_lstm, batch_size=32, nb_epoch=10,
                            validation_split=0.1, verbose=1)
    # Text is tokenised to integer ids, padded to a fixed length, then modelled.
    pipe = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                           ('padd_seq', PadNumericSequence()),
                           ('cnn_lstm', model)])
    grid = {'txt_to_seq__n_max_features': [20000],
            'padd_seq__max_seq_len': [300],
            'cnn_lstm__embedding_dims': [50]}
    job.run(pipe, grid, data)
    return None
def pre_process_data_for_deep_learning(data, fs_text = 'text', verbose = 0):
    """Transform the text column to numeric sequences and attach the result.

    Fix: the `verbose` parameter was accepted but ignored while a debug
    ``print`` fired unconditionally; the print is now gated on `verbose`.

    Parameters:
        data: project data object exposing ``df`` (DataFrame) and ``fs_ind``.
        fs_text: name of the text column to transform.
        verbose: when truthy, print the transformed sequences for inspection.

    Returns:
        The same ``data`` object, mutated: ``data.df`` gains a
        'text_numeric' column and ``data.fs_ind`` is set to that column list.
    """
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence())])
    p = pipeline.fit(data.df[fs_text].values)
    text_num = p.transform(data.df[fs_text].values)
    if verbose:
        print(text_num)
    # NOTE(review): assigning a single column name assumes the transform yields
    # one value per row — confirm against PadNumericSequence's output shape.
    text_num = pd.DataFrame(text_num)
    fe_columns = ['text_numeric']
    text_num.columns = fe_columns

    # Join on the default index; assumes data.df has a clean RangeIndex.
    data.df = data.df.join(text_num)
    data.fs_ind = fe_columns
    return data
def aj_embedding_fasttext(data):
    """Run the fasttext baseline experiment through the Job runner."""
    job = Job('aj_embedding_fasttext', cv = cv_n_fold_dl, n_threads = 1)
    model = KerasClassifier(build_fn=fasttext, batch_size=32, nb_epoch=5,
                            validation_split=0.1, verbose=1)
    pipe = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                           ('padd_seq', PadNumericSequence()),
                           ('ft', model)])
    # TODO: add ngram features based on the paper
    grid = {'txt_to_seq__n_max_features': [20000],
            'padd_seq__max_seq_len': [300],
            'ft__max_seq_len': [300],
            'ft__embedding_dims': [100]}
    job.run(pipe, grid, data)
    return None
def al_glove_cnn_lstm(data):
    """Run the GloVe-initialised CNN+LSTM experiment through the Job runner."""
    job = Job('al_glove_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)

    # build_fn=glove_cnn_lstm presumably reads this module-level matrix,
    # hence the global publication instead of a direct argument.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind],
                                               max_features=20000,
                                               embedding_dims=300)

    model = KerasClassifier(build_fn=glove_cnn_lstm, batch_size=64,
                            nb_epoch=10, validation_split=0.1, verbose=1)
    pipe = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                           ('padd_seq', PadNumericSequence()),
                           ('g_c_l', model)])
    grid = {'txt_to_seq__n_max_features': [20000],
            'padd_seq__max_seq_len': [300],
            'g_c_l__max_seq_len': [300],
            'g_c_l__embedding_dims': [300]}
    job.run(pipe, grid, data)
    return None
def ao_multi_fltr_glove_cnn(data):
    """Run the multi-filter GloVe CNN experiment through the Job runner."""
    # Fixed hyper-parameters for this experiment.
    n_vocab = 20000
    seq_len = 300
    emb_dim = 300

    job = Job('ao_multi_fltr_glove_cnn', cv = cv_n_fold_dl, n_threads = 1)

    # The Keras build_fn presumably reads this module-level matrix, hence the
    # global publication instead of a direct argument.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind],
                                               max_features=n_vocab,
                                               embedding_dims=emb_dim)

    classifier = KerasClassifier(build_fn=multi_fltr_glove_cnn, batch_size=64,
                                 nb_epoch=10, validation_split=0.1, verbose=1)
    steps = [('txt_to_seq', TextToNumericSequence()),
             ('padd_seq', PadNumericSequence()),
             ('m', classifier)]
    grid = {'txt_to_seq__n_max_features': [n_vocab],
            'padd_seq__max_seq_len': [seq_len],
            'm__max_features': [n_vocab],
            'm__max_seq_len': [seq_len],
            'm__embedding_dims': [emb_dim]}
    job.run(Pipeline(steps=steps), grid, data)
    return None