# Imports assumed by the snippets below. Project-local helpers (Job, Data,
# DirStructure, TextToNumericSequence, PadNumericSequence, Word2VecTransformer,
# create_embedding_matrix, get_tfidf_truncSVD_features, get_callbacks,
# scoring_function, the Keras build functions, and names such as path_base,
# fl_word_vectors and fl_word_vectors_zip) come from the project's own
# q_scripts modules and are not reproduced here.
import copy
import os
import sys

from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.externals import joblib  # `import joblib` on newer scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import BayesianRidge, LogisticRegression, SGDClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.pipeline import Pipeline


def ec_AdaBoost_Extratree(data):
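    """AdaBoost over ExtraTrees regressors, run as an ensemble-stage job
    (the ensamble_model_job flag presumably routes it to the stacked outputs
    of the base models rather than to raw text features)."""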
    job = Job('ec_AdaBoost_Extratree', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('ada_extraTree', 
                                AdaBoostRegressor(ExtraTreesRegressor()))])
    parameters = dict(ada_extraTree__base_estimator__n_estimators = [10])
    job.run(pipeline, parameters, data)
    return None
def am_glove_fasttext(data):
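    """fastText-style Keras model on pretrained GloVe embeddings.

    The embedding matrix is published through a module-level global because
    KerasClassifier's build_fn only receives scalar hyperparameters."""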
    job = Job('am_glove_fasttext', cv = cv_n_fold_dl, n_threads = 1, 
              model_package = 'keras')
    max_features = 40000
    max_seq_len = 700
    embedding_dims = 300
    batch_size = 256
    nb_epoch = 200
    
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], 
                                               max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=glove_fasttext, batch_size=batch_size,
                         validation_split = 0.1, nb_epoch=nb_epoch, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
def ae_tfidf_BayesianRidge(data):
    job = Job('ae_tfidf_BayesianRidge')
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('br', BayesianRidge())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)])
    job.run(pipeline, parameters, data)
    return None    
def ab_tfidf_elasticnet(data):
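    """TF-IDF n-grams + elastic-net linear classifier (SGDClassifier with
    penalty='elasticnet'), grid-searched over alpha and the L1/L2 mix."""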
    job = Job('ab_tfidf_elasticnet', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('elnet', SGDClassifier(penalty="elasticnet"))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2), (1, 3)],
                      elnet__alpha = [1e-5, 1e-4, 1e-3, 1e-2],  # wider grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__l1_ratio = [0.1, 0.5, 0.8, 0.9, 0.99])
    job.run(pipeline, parameters, data)
    return None
def af_vecAvg_MaxEnt_OutputCode(data):
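    """Averaged word-vector features + error-correcting output codes around
    logistic regression; code_size = 10 means ten times as many code bits as
    there are classes."""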
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fld.get_path(fld.model_meta_data, fl_word_vectors), 
                                                              dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def aa_tfidf_MaxEnt(data):
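    """TF-IDF (uni- and bigrams, capped vocabulary) + logistic regression,
    grid-searched over the regularization strength C."""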
    job = Job('aa_tfidf_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         max_features = 2000,
                                                         min_df = 5)),
                               ('m', LogisticRegression())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def ac_truncSVD_GBM(data):
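    """GBM on dense LSA features (TF-IDF followed by truncated SVD).

    Works on a deep copy so the added SVD features do not leak into the
    Data object shared with the other jobs."""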
    job = Job('ac_truncSVD_GBM', cv = cv_n_fold)
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind,
                                            ngram_range = (1, 2),
                                            n_components = 2000, verbose = 1)
    # TODO: record the variance explained at n_components = 2000
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [100, 300, 500])
    job.run(pipeline, parameters, data_tSVD)
    return None
def ak_embedding_cnn_lstm(data):
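    """Word embeddings learned from scratch feeding a CNN + LSTM stack, built
    by the project's cnn_lstm build function."""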
    job = Job('ak_embedding_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    cnn_lstm_model = KerasClassifier(build_fn=cnn_lstm, batch_size=32, nb_epoch=10,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('cnn_lstm', cnn_lstm_model)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      cnn_lstm__embedding_dims = [50])
    job.run(pipeline, parameters, data)
    return None
def aa_tfidf_MaxEnt_OutputCode(data):
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         max_features = 2000,
                                                         min_df = 5)),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def ag_vecAvg_randomForest(data):
    job = Job('ag_vecAvg_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg",
                                Word2VecTransformer(fl_word_vectors_zip,
                                                    fl_word_vectors,
                                                    dim = 300,
                                                    all_text_data = 
                                                     list(data.df[data.fs_ind]))),
                               ('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [30, 90, 270])
    job.run(pipeline, parameters, data)
    return None
def af_vecAvg_MaxEnt(data):
    job = Job('af_vecAvg_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg",
                                Word2VecTransformer(fl_word_vectors_zip,
                                                    fl_word_vectors,
                                                    dim = 300,
                                                    all_text_data = 
                                                     list(data.df[data.fs_ind]))),
                               ('m', LogisticRegression())])
    parameters = dict(m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
    job = Job('ab_tfidf_elasticnet_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         min_df = 5)),
                               ('elnet', OutputCodeClassifier(
                               SGDClassifier(penalty="elasticnet"),
                               code_size = 100))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      elnet__estimator__alpha = [0.0001],  # wider grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__estimator__l1_ratio = [0.1])  # wider grid: [0.1, 0.5, 0.8, 0.9, 0.99]
    job.run(pipeline, parameters, data)
    return None
def aj_embedding_fasttext(data):
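    """fastText-style model (embedding layer plus pooling) with embeddings
    learned from scratch; the TODO below refers to the word n-gram features
    described in the fastText paper."""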
    job = Job('aj_embedding_fasttext', cv = cv_n_fold_dl, n_threads = 1)
    ft_model = KerasClassifier(build_fn=fasttext, batch_size=32, nb_epoch=5,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('ft', ft_model)])
    # TODO: add ngram features based on the paper
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      ft__max_seq_len = [300],
                      ft__embedding_dims = [100])
    job.run(pipeline, parameters, data)
    return None
def ae_truncSVD_randomForest(data):
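    """Random forest on dense truncated-SVD features; the grid below is the
    largest in this file (5 x 5 x 4 x 3 = 300 parameter combinations)."""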
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind,
                                            ngram_range = (1, 2),
                                            n_components = 2000, verbose = 1)
    # TODO: record the variance explained at n_components = 2000
    job = Job('ae_truncSVD_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [10, 30, 90, 270, 810],
                      rf__max_features = [60, 80, 'auto', 120, 140],
                      rf__max_depth = [5, 10, 15, None],
                      rf__min_samples_split = [2, 5, 10])
    job.run(pipeline, parameters, data_tSVD)
    return None
def al_glove_cnn_lstm(data):
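    """CNN + LSTM on pretrained GloVe embeddings; as in am_glove_fasttext, the
    embedding matrix is handed to the build function via a module-level global."""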
    job = Job('al_glove_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=20000,
                                               embedding_dims = 300)
    glove_cnn_lstm_m = KerasClassifier(build_fn=glove_cnn_lstm, batch_size=64, 
                                       nb_epoch=10,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('g_c_l', glove_cnn_lstm_m)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      g_c_l__max_seq_len = [300],
                      g_c_l__embedding_dims = [300])
    job.run(pipeline, parameters, data)
    return None
def ao_multi_fltr_glove_cnn(data):
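    """Multi-filter-width CNN over pretrained GloVe embeddings, with the
    shared hyperparameters hoisted into locals."""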
    job = Job('ao_multi_fltr_glove_cnn', cv = cv_n_fold_dl, n_threads = 1)
    max_features = 20000
    max_seq_len = 300
    embedding_dims = 300
    batch_size = 64
    nb_epoch = 10
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=multi_fltr_glove_cnn, batch_size=batch_size, nb_epoch=nb_epoch,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
def am_glove_fasttext_CI_DRM_IDC_Target_noTP(data, train_or_load = 'train'):
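    """Train-or-load variant of am_glove_fasttext for the CI_DRM_IDC_Target_noTP
    job: fits the preprocessing pipeline and the Keras model directly (outside
    Job.run) so both can be persisted to disk, and reloads them when
    train_or_load == 'load'."""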
    job = Job('CI_DRM_IDC_Target_noTP', cv = cv_n_fold_dl, n_threads = 1,
              save_model_tf = True, model_package = None)
    if train_or_load == 'train':
        try:
            max_features = 25000
            max_seq_len = 400
            embedding_dims = 300
            batch_size = 64
            nb_epoch = 200
            
            def pre_process_data_for_deep_learning(data, fs_text = 'text', verbose = 0):
                pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence(max_features)),
                                           ('padd_seq', PadNumericSequence(max_seq_len))])
                m_pre_proc = pipeline.fit(data.df[fs_text].values)
                text_num = m_pre_proc.transform(data.df[fs_text].values)
                # text_num = np.array(text_num)
                return text_num, m_pre_proc
            text_num, m_pre_proc = pre_process_data_for_deep_learning(data, 
                                                                     fs_text = data.fs_ind)
            
            global embedding_matrix
            embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], 
                                                       max_features=max_features,
                                                       embedding_dims = embedding_dims)
            model = glove_fasttext(max_features = max_features, 
                                   embedding_dims = embedding_dims, 
                                   max_seq_len = max_seq_len)
            callback_lst = get_callbacks(job)
            if len(data.idx_valid) != 0:
                model.fit(text_num[data.idx_train], to_categorical(data.y_train(), 
                                                      nb_classes = n_output_hidden_units),
                          validation_data = (text_num[data.idx_valid], 
                                             to_categorical(data.y_valid(), 
                                               nb_classes = n_output_hidden_units)),
                          batch_size = batch_size, nb_epoch = nb_epoch,  verbose = 1,
                          callbacks = callback_lst)
                print('CV score:', scoring_function(data.y_valid(),
                                     model.predict_classes(text_num[data.idx_valid])))
            else:
                model.fit(text_num[data.idx_train],
                          to_categorical(data.y_train(),
                                         nb_classes = n_output_hidden_units),
                          validation_split = 0.05,
                          batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1,
                          callbacks = callback_lst)
            
            fld.create_fld_if_not_exist(fld.get_path(fld.model_scoring, 
                                                     job.job_name, 'model'))
            path = fld.get_path(fld.model_scoring, job.job_name, 'model', 
                                        'model.h5')
            model.save(path)
            path = fld.get_path(fld.model_scoring, job.job_name, 'model', 
                                        'model_pre_proc.pkl')
            joblib.dump(m_pre_proc, path)
            print('Model saved')
        except Exception as e:
            error_code = 'model_training_failed'
            error_log = str(e)
            print(error_code)
            print(error_log)
    elif train_or_load == 'load':
        path = fld.get_path(fld.model_scoring, job.job_name, 'model', 
                                    'model_pre_proc.pkl')
        m_pre_proc = joblib.load(path)
        path = fld.get_path(fld.model_scoring, job.job_name, 'model',
                                    'model.h5')
        model = load_model(path)
        return m_pre_proc, model
    else:
        raise ValueError('train_or_load should be either "train" or "load"')
    return None
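# Globals and scoring setup used by the jobs above -----------------------------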
n_output_hidden_units = 38
cv_n_fold = 10
cv_n_fold_dl = 0

# Set Base Path----------------------------------------------------------------
os.chdir(path_base)
sys.path.append(path_base)
from q_scripts.a_class_func_dir import DirStructure, Job, Data

# Read Config File: -----------------------------------------------------------
fld = DirStructure('config.ini')
data = Data(pd_or_np='pd', fl_submission_input=None)
job = Job('CI_DRM_IDC_Target_noTP',
          cv=cv_n_fold_dl,
          n_threads=1,
          save_model_tf=True,
          model_package=None)
path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model_pre_proc.pkl')
m_pre_proc = joblib.load(path)
path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model.h5')
model = load_model(path)


def checkData(obs):
    # Checks the data after preprocessing and returns the status.
    obs = "".join(obs)
    status = {}
    if obs.strip() == "":
        # NOTE: the error payload shape is assumed; the 'value' message is
        # illustrative, as the original snippet was cut off mid-dict.
        status = {'Error': [{'value': 'empty text after preprocessing'}]}
    return status
def ba_GBM(data, name = ''):
    job = Job('ba_GBM_' + name, cv = cv_n_fold)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [2])
    job.run(pipeline, parameters, data)
    return None
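# Ensemble-stage jobs: the ensamble_model_job flag presumably points Job.run at
# the pooled predictions of the base models above instead of raw text features.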
def ed_GBM(data):
    job = Job('ed_GBM', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [30, 100, 300])
    job.run(pipeline, parameters, data)
    return None
def ea_regression(data):
    job = Job('ea_regression', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('reg', SGDClassifier())])
    parameters = dict(reg__alpha = [0.0001])
    job.run(pipeline, parameters, data)
    return None
def eb_randomForest(data):
    job = Job('eb_randomForest', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [3, 9, 27])
    job.run(pipeline, parameters, data)
    return None
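# Minimal sketch of a driver for the jobs above; the function name and the job
# selection are illustrative, not from the source project.
def run_selected_jobs(data):
    for job_fn in (aa_tfidf_MaxEnt, ab_tfidf_elasticnet, ac_truncSVD_GBM):
        job_fn(data)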