def am_glove_fasttext(data):
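    # GloVe-initialized FastText-style job: text -> integer sequences -> padded
    # sequences -> Keras model. `embedding_matrix` is published as a module-level
    # global, presumably so the `glove_fasttext` build_fn (defined elsewhere in
    # the project) can pick it up without it passing through the scikit-learn
    # parameter grid.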
    job = Job('am_glove_fasttext', cv = cv_n_fold_dl, n_threads = 1, 
              model_package = 'keras')
    max_features = 40000
    max_seq_len = 700
    embedding_dims = 300
    batch_size = 256
    nb_epoch = 200
    
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], 
                                               max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=glove_fasttext, batch_size=batch_size,
                         validation_split = 0.1, nb_epoch=nb_epoch, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
def ec_AdaBoost_Extratree(data):
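    # Second-stage job (ensamble_model_job = True -- spelling follows the Job
    # API), presumably fit on the first-stage models' predictions: AdaBoost
    # over ExtraTrees regressors. Note the grid tunes the inner forest's
    # n_estimators, not the number of boosting rounds.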
    job = Job('ec_AdaBoost_Extratree', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('ada_extraTree', 
                                AdaBoostRegressor(ExtraTreesRegressor()))])
    parameters = dict(ada_extraTree__base_estimator__n_estimators = [10])
    job.run(pipeline, parameters, data)
    return None
def ae_tfidf_BayesianRidge(data):
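    # TF-IDF unigram/bigram features into a BayesianRidge regressor; no cv
    # argument is passed, so Job's default applies.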
    job = Job('ae_tfidf_BayesianRidge')
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('br', BayesianRidge())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)])
    job.run(pipeline, parameters, data)
    return None    
def af_vecAvg_MaxEnt_OutputCode(data):
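    # Averaged word vectors into an error-correcting output-code wrapper around
    # logistic regression. Note this call resolves the vector file via
    # fld.get_path, while ag_/af_ below pass fl_word_vectors_zip and
    # fl_word_vectors separately; presumably two signatures of the same loader.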
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fld.get_path(fld.model_meta_data, fl_word_vectors), 
                                                              dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet(data):
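    # TF-IDF n-grams into an elastic-net linear classifier (SGDClassifier with
    # penalty='elasticnet'), sweeping n-gram range, alpha, and the L1/L2 mix.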
    job = Job('ab_tfidf_elasticnet', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('elnet', SGDClassifier(penalty="elasticnet"))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2), (1, 3)],
                      elnet__alpha = [1e-5, 1e-4, 1e-3, 1e-2],  # wider grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__l1_ratio = [0.1, 0.5, 0.8, 0.9, 0.99])
    job.run(pipeline, parameters, data)
    return None
def ac_truncSVD_GBM(data):
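    # TF-IDF -> truncated SVD (LSA) features, then gradient boosting. The data
    # object is deep-copied first, presumably because get_tfidf_truncSVD_features
    # replaces the raw text features with dense SVD components.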
    job = Job('ac_truncSVD_GBM', cv = cv_n_fold)
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind,
                                            ngram_range = (1, 2),
                                            n_components = 2000, verbose=1)
    # n_components = 2000 --> variance explained = 
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [100, 300, 500])
    job.run(pipeline, parameters, data_tSVD)
    return None
def aa_tfidf_MaxEnt(data):
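    # Baseline: capped-vocabulary TF-IDF bigrams into logistic regression,
    # sweeping the inverse regularization strength C.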
    job = Job('aa_tfidf_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         max_features = 2000,
                                                         min_df = 5)),
                               ('m', LogisticRegression())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def ak_embedding_cnn_lstm(data):
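    # Learned-embedding CNN + LSTM (build_fn `cnn_lstm` defined elsewhere);
    # n_threads = 1 and the separate cv_n_fold_dl splitter keep the Keras jobs
    # single-threaded, presumably to avoid concurrent session use.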
    job = Job('ak_embedding_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    cnn_lstm_model = KerasClassifier(build_fn=cnn_lstm, batch_size=32, nb_epoch=10,
                                     validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('cnn_lstm', cnn_lstm_model)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      cnn_lstm__embedding_dims = [50])
    job.run(pipeline, parameters, data)
    return None
def ag_vecAvg_randomForest(data):
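    # Averaged pretrained 300-d word vectors into a random forest, sweeping
    # forest size.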
    job = Job('ag_vecAvg_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg",
                                Word2VecTransformer(fl_word_vectors_zip,
                                                    fl_word_vectors,
                                                    dim = 300,
                                                    all_text_data = list(data.df[data.fs_ind]))),
                               ('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [30, 90, 270])
    job.run(pipeline, parameters, data)
    return None
def af_vecAvg_MaxEnt(data):
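    # Same averaged-word-vector features as ag_ above, with logistic regression
    # and a sweep over C.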
    job = Job('af_vecAvg_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg",
                                Word2VecTransformer(fl_word_vectors_zip,
                                                    fl_word_vectors,
                                                    dim = 300,
                                                    all_text_data = list(data.df[data.fs_ind]))),
                               ('m', LogisticRegression())])
    parameters = dict(m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def aa_tfidf_MaxEnt_OutputCode(data):
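    # Output-code variant of aa_tfidf_MaxEnt: the same TF-IDF features feed an
    # error-correcting output-code ensemble of logistic regressions.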
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         max_features = 2000,
                                                         min_df = 5)),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
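    # Output-code variant of ab_tfidf_elasticnet; code_size = 100 multiplies
    # the number of binary subproblems, so the inner SGD grid is kept to a
    # single point.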
    job = Job('ab_tfidf_elasticnet_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         min_df = 5)),
                               ('elnet', OutputCodeClassifier(
                               SGDClassifier(penalty="elasticnet"),
                               code_size = 100))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],            # alternative: [(1, 3)]
                      elnet__estimator__alpha = [0.0001],       # wider grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__estimator__l1_ratio = [0.1])       # wider grid: [0.1, 0.5, 0.8, 0.9, 0.99]
    job.run(pipeline, parameters, data)
    return None
def ae_truncSVD_randomForest(data):
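    # Truncated-SVD features (as in ac_truncSVD_GBM) into a random forest, with
    # a full grid over forest size, feature subsampling, depth, and split size.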
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind,
                                            ngram_range = (1, 2),
                                            n_components = 2000, verbose=1)
    # n_components = 2000 --> variance explained = 
    job = Job('ae_truncSVD_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [10, 30, 90, 270, 810],
                      rf__max_features = [60, 80, 'auto', 120, 140],
                      rf__max_depth = [5, 10, 15, None],
                      rf__min_samples_split = [2, 5, 10])
    job.run(pipeline, parameters, data_tSVD)
    return None
def aj_embedding_fasttext(data):
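    # FastText-style averaging model over learned embeddings; per the TODO
    # below, the n-gram features from the original paper are not yet added.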
    job = Job('aj_embedding_fasttext', cv = cv_n_fold_dl, n_threads = 1)
    ft_model = KerasClassifier(build_fn=fasttext, batch_size=32, nb_epoch=5,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('ft', ft_model)])
    # TODO: add ngram features based on the paper
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      ft__max_seq_len = [300],
                      ft__embedding_dims = [100])
    job.run(pipeline, parameters, data)
    return None
def al_glove_cnn_lstm(data):
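    # GloVe-initialized CNN + LSTM: same sequence pipeline as ak_, but the
    # embedding layer starts from the global `embedding_matrix` built here.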
    job = Job('al_glove_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=20000,
                                               embedding_dims = 300)
    glove_cnn_lstm_m = KerasClassifier(build_fn=glove_cnn_lstm, batch_size=64,
                                       nb_epoch=10, validation_split = 0.1,
                                       verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('g_c_l', glove_cnn_lstm_m)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      g_c_l__max_seq_len = [300],
                      g_c_l__embedding_dims = [300])
    job.run(pipeline, parameters, data)
    return None
def ao_multi_fltr_glove_cnn(data):
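    # Multi-filter CNN over GloVe embeddings (parallel convolutions of several
    # widths, judging by the build_fn name). Hyperparameters are hoisted into
    # locals so the parameter grid and the embedding matrix stay consistent.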
    job = Job('ao_multi_fltr_glove_cnn', cv = cv_n_fold_dl, n_threads = 1)
    max_features = 20000
    max_seq_len = 300
    embedding_dims = 300
    batch_size = 64
    nb_epoch = 10
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=multi_fltr_glove_cnn, batch_size=batch_size,
                        nb_epoch=nb_epoch, validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
def ed_GBM(data):
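    # Second-stage GBM meta-model (presumably what ensamble_model_job = True
    # selects: fitting on first-stage predictions).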
    job = Job('ed_GBM', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [30, 100, 300])
    job.run(pipeline, parameters, data)
    return None
def eb_randomForest(data):
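    # Second-stage random forest meta-model with small forest sizes.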
    job = Job('eb_randomForest', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [3, 9, 27])
    job.run(pipeline, parameters, data)
    return None
def ea_regression(data):
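    # Second-stage linear meta-model (SGDClassifier with its default penalty),
    # despite the 'regression' in the name.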
    job = Job('ea_regression', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('reg', SGDClassifier())])
    parameters = dict(reg__alpha = [0.0001])
    job.run(pipeline, parameters, data)
    return None
def ba_GBM(data, name = ''):
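    # Parameterized GBM job: `name` tags the run so the job can be reused on
    # different feature sets; n_estimators = [2] looks like a smoke-test setting.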
    job = Job('ba_GBM_' + name, cv = cv_n_fold)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [2])
    job.run(pipeline, parameters, data)
    return None