def ec_AdaBoost_Extratree(data):
    # Ensemble-stage job: AdaBoost over extra-trees regressors.
    job = Job('ec_AdaBoost_Extratree', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('ada_extraTree', AdaBoostRegressor(ExtraTreesRegressor()))])
    parameters = dict(ada_extraTree__base_estimator__n_estimators = [10])
    job.run(pipeline, parameters, data)
    return None
def am_glove_fasttext(data):
    # fastText-style classifier built on pre-trained GloVe embeddings (glove_fasttext build_fn).
    job = Job('am_glove_fasttext', cv = cv_n_fold_dl, n_threads = 1, model_package = 'keras')
    max_features = 40000
    max_seq_len = 700
    embedding_dims = 300
    batch_size = 256
    nb_epoch = 200
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=glove_fasttext, batch_size=batch_size, validation_split = 0.1,
                        nb_epoch=nb_epoch, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
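# create_embedding_matrix is defined elsewhere in the repo. Below is a minimal sketch
# of what such a helper typically does, assuming GloVe vectors in plain text format and
# the Keras 1.x Tokenizer API; the function name and file path are illustrative
# assumptions, not the repo's actual implementation.
import numpy as np
from keras.preprocessing.text import Tokenizer

def create_embedding_matrix_sketch(texts, max_features, embedding_dims,
                                   glove_path = 'glove.840B.300d.txt'):  # hypothetical path
    # Map each of the top max_features tokens to its pre-trained vector;
    # words without a pre-trained vector keep a row of zeros.
    tokenizer = Tokenizer(nb_words = max_features)
    tokenizer.fit_on_texts(list(texts))
    embeddings = {}
    with open(glove_path) as f:
        for line in f:
            values = line.rstrip().split(' ')
            embeddings[values[0]] = np.asarray(values[1:], dtype = 'float32')
    matrix = np.zeros((max_features, embedding_dims))
    for word, i in tokenizer.word_index.items():
        if i < max_features and word in embeddings:
            matrix[i] = embeddings[word]
    return matrix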
def ae_tfidf_BayesianRidge(data):
    # TF-IDF features into a Bayesian ridge regression.
    job = Job('ae_tfidf_BayesianRidge')
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('br', BayesianRidge())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet(data):
    # TF-IDF features into an elastic-net-penalised linear model (SGD).
    job = Job('ab_tfidf_elasticnet', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english')),
                               ('elnet', SGDClassifier(penalty="elasticnet"))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2), (1, 3)],
                      elnet__alpha = [1e-5, 1e-4, 1e-3, 1e-2],  # full grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__l1_ratio = [0.1, 0.5, 0.8, 0.9, 0.99])
    job.run(pipeline, parameters, data)
    return None
def af_vecAvg_MaxEnt_OutputCode(data):
    # Averaged word vectors into an error-correcting output-code wrapper around logistic regression.
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fld.get_path(fld.model_meta_data, fl_word_vectors),
                                                              dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('m', OutputCodeClassifier(LogisticRegression(), code_size = 10))])
    parameters = dict(m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
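# Word2VecTransformer is defined elsewhere in the repo (note it is constructed with a
# resolved path here but with a zip + file pair in af_/ag_ below). A minimal sketch of
# the vector-averaging idea it implements; the class name and constructor here are
# simplified assumptions, with word_vectors as a plain token -> vector dict.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class VecAvgSketch(BaseEstimator, TransformerMixin):
    # Represent each document as the mean of its known word vectors.
    def __init__(self, word_vectors, dim = 300):
        self.word_vectors = word_vectors
        self.dim = dim

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        out = np.zeros((len(X), self.dim))
        for i, doc in enumerate(X):
            vecs = [self.word_vectors[w] for w in doc.split() if w in self.word_vectors]
            if vecs:
                out[i] = np.mean(vecs, axis = 0)  # docs with no known words stay all-zero
        return out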
def aa_tfidf_MaxEnt(data):
    # TF-IDF features into a plain logistic regression baseline.
    job = Job('aa_tfidf_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english', max_features = 2000, min_df = 5)),
                               ('m', LogisticRegression())])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def ac_truncSVD_GBM(data):
    # Dense truncated-SVD features (from TF-IDF) into gradient boosting.
    job = Job('ac_truncSVD_GBM', cv = cv_n_fold)
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind, ngram_range = (1, 2),
                                            n_components = 2000, verbose=1)
    # n_components = 2000 --> variance explained =
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [100, 300, 500])
    job.run(pipeline, parameters, data_tSVD)
    return None
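# get_tfidf_truncSVD_features is defined elsewhere; its core transformation is TF-IDF
# followed by TruncatedSVD (i.e. latent semantic analysis), which turns the sparse
# n-gram matrix into dense, low-rank features that tree models can consume. A minimal
# sketch of that step under those assumptions (the repo's helper also rewires the
# Data object's feature columns, which is omitted here):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

def tfidf_trunc_svd_sketch(texts, ngram_range = (1, 2), n_components = 2000):
    lsa = make_pipeline(TfidfVectorizer(stop_words = 'english', ngram_range = ngram_range),
                        TruncatedSVD(n_components = n_components))
    dense = lsa.fit_transform(texts)
    # Useful for filling in the "variance explained" note above.
    print('variance explained:', lsa.named_steps['truncatedsvd'].explained_variance_ratio_.sum())
    return dense, lsa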
def ak_embedding_cnn_lstm(data):
    # Learned (not pre-trained) embeddings into a stacked CNN + LSTM classifier.
    job = Job('ak_embedding_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    cnn_lstm_model = KerasClassifier(build_fn=cnn_lstm, batch_size=32, nb_epoch=10,
                                     validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('cnn_lstm', cnn_lstm_model)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      cnn_lstm__embedding_dims = [50])
    job.run(pipeline, parameters, data)
    return None
def aa_tfidf_MaxEnt_OutputCode(data):
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english', max_features = 2000, min_df = 5)),
                               ('m', OutputCodeClassifier(LogisticRegression(), code_size = 10))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def ag_vecAvg_randomForest(data):
    job = Job('ag_vecAvg_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fl_word_vectors_zip, fl_word_vectors, dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [30, 90, 270])
    job.run(pipeline, parameters, data)
    return None
def af_vecAvg_MaxEnt(data):
    job = Job('af_vecAvg_MaxEnt', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fl_word_vectors_zip, fl_word_vectors, dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('m', LogisticRegression())])
    parameters = dict(m__C = [0.001, 0.01, 0.1, 1, 10])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
    # Elastic-net SGD classifier wrapped in error-correcting output codes.
    job = Job('ab_tfidf_elasticnet_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english', min_df = 5)),
                               ('elnet', OutputCodeClassifier(SGDClassifier(penalty="elasticnet"),
                                                              code_size = 100))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],              # alternative: [(1, 3)]
                      elnet__estimator__alpha = [0.0001],         # full grid: [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__estimator__l1_ratio = [0.1])         # full grid: [0.1, 0.5, 0.8, 0.9, 0.99]
    job.run(pipeline, parameters, data)
    return None
def aj_embedding_fasttext(data):
    # fastText-style classifier with learned (not pre-trained) embeddings.
    job = Job('aj_embedding_fasttext', cv = cv_n_fold_dl, n_threads = 1)
    ft_model = KerasClassifier(build_fn=fasttext, batch_size=32, nb_epoch=5,
                               validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('ft', ft_model)])
    # TODO: add n-gram features based on the fastText paper (see the sketch below)
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      ft__max_seq_len = [300],
                      ft__embedding_dims = [100])
    job.run(pipeline, parameters, data)
    return None
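# For the TODO above: the fastText paper (Joulin et al., 2016, "Bag of Tricks for
# Efficient Text Classification") augments each token sequence with hashed bigram ids
# so the averaging layer captures local word order. A minimal sketch of that
# augmentation, assuming integer sequences as produced by TextToNumericSequence;
# the bucket count and hash function are arbitrary assumptions.
def add_bigram_features(sequences, max_features, n_buckets = 200000):
    # Bigram ids live in [max_features, max_features + n_buckets), so they never
    # collide with unigram ids; the embedding layer's input_dim must grow accordingly.
    augmented = []
    for seq in sequences:
        bigrams = [max_features + (seq[i] * 31 + seq[i + 1]) % n_buckets
                   for i in range(len(seq) - 1)]
        augmented.append(list(seq) + bigrams)
    return augmented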
def ad_truncSVD_randomForest(data):
    # Dense truncated-SVD features into a random forest with a broad hyper-parameter grid.
    data_tSVD = copy.deepcopy(data)
    data_tSVD = get_tfidf_truncSVD_features(data_tSVD, fs_text = data.fs_ind, ngram_range = (1, 2),
                                            n_components = 2000, verbose=1)
    # n_components = 2000 --> variance explained =
    job = Job('ad_truncSVD_randomForest', cv = cv_n_fold)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [10, 30, 90, 270, 810],
                      rf__max_features = [60, 80, 'auto', 120, 140],
                      rf__max_depth = [5, 10, 15, None],
                      rf__min_samples_split = [2, 5, 10])
    job.run(pipeline, parameters, data_tSVD)
    return None
def al_glove_cnn_lstm(data):
    # Pre-trained GloVe embeddings into a CNN + LSTM classifier.
    job = Job('al_glove_cnn_lstm', cv = cv_n_fold_dl, n_threads = 1)
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=20000,
                                               embedding_dims = 300)
    glove_cnn_lstm_m = KerasClassifier(build_fn=glove_cnn_lstm, batch_size=64, nb_epoch=10,
                                       validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('g_c_l', glove_cnn_lstm_m)])
    parameters = dict(txt_to_seq__n_max_features = [20000],
                      padd_seq__max_seq_len = [300],
                      g_c_l__max_seq_len = [300],
                      g_c_l__embedding_dims = [300])
    job.run(pipeline, parameters, data)
    return None
def ao_multi_fltr_glove_cnn(data):
    job = Job('ao_multi_fltr_glove_cnn', cv = cv_n_fold_dl, n_threads = 1)
    max_features = 20000
    max_seq_len = 300
    embedding_dims = 300
    batch_size = 64
    nb_epoch = 10
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=max_features,
                                               embedding_dims = embedding_dims)
    m = KerasClassifier(build_fn=multi_fltr_glove_cnn, batch_size=batch_size, nb_epoch=nb_epoch,
                        validation_split = 0.1, verbose = 1)
    pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence()),
                               ('padd_seq', PadNumericSequence()),
                               ('m', m)])
    parameters = dict(txt_to_seq__n_max_features = [max_features],
                      padd_seq__max_seq_len = [max_seq_len],
                      m__max_features = [max_features],
                      m__max_seq_len = [max_seq_len],
                      m__embedding_dims = [embedding_dims])
    job.run(pipeline, parameters, data)
    return None
def am_glove_fasttext_CI_DRM_IDC_Target_noTP(data, train_or_load = 'train'):
    # Production variant of am_glove_fasttext: trains on the full data and persists both
    # the fitted pre-processing pipeline and the Keras model, or loads them back.
    job = Job('CI_DRM_IDC_Target_noTP', cv = cv_n_fold_dl, n_threads = 1,
              save_model_tf = True, model_package = None)
    if train_or_load == 'train':
        try:
            max_features = 25000
            max_seq_len = 400
            embedding_dims = 300
            batch_size = 64
            nb_epoch = 200

            def pre_process_data_for_deep_learning(data, fs_text = 'text', verbose = 0):
                pipeline = Pipeline(steps=[('txt_to_seq', TextToNumericSequence(max_features)),
                                           ('padd_seq', PadNumericSequence(max_seq_len))])
                m_pre_proc = pipeline.fit(data.df[fs_text].values)
                text_num = m_pre_proc.transform(data.df[fs_text].values)
                # text_num = np.array(text_num)
                return text_num, m_pre_proc

            text_num, m_pre_proc = pre_process_data_for_deep_learning(data, fs_text = data.fs_ind)
            global embedding_matrix
            embedding_matrix = create_embedding_matrix(data.df[data.fs_ind], max_features=max_features,
                                                       embedding_dims = embedding_dims)
            model = glove_fasttext(max_features = max_features, embedding_dims = embedding_dims,
                                   max_seq_len = max_seq_len)
            callback_lst = get_callbacks(job)
            if len(data.idx_valid) != 0:
                # Explicit validation split provided by the Data object.
                model.fit(text_num[data.idx_train],
                          to_categorical(data.y_train(), nb_classes = n_output_hidden_units),
                          validation_data = (text_num[data.idx_valid],
                                             to_categorical(data.y_valid(), nb_classes = n_output_hidden_units)),
                          batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1,
                          callbacks = callback_lst)
                print('CV score:', scoring_function(data.y_valid(),
                                                    model.predict_classes(text_num[data.idx_valid])))
            else:
                model.fit(text_num[data.idx_train],
                          to_categorical(data.y_train(), nb_classes = n_output_hidden_units),
                          validation_split = 0.05,
                          batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1,
                          callbacks = callback_lst)
            fld.create_fld_if_not_exist(fld.get_path(fld.model_scoring, job.job_name, 'model'))
            path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model.h5')
            model.save(path)
            path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model_pre_proc.pkl')
            joblib.dump(m_pre_proc, path)
            print('Model saved')
        except Exception as e:
            error_code = 'model_training_failed'
            error_log = str(e)
            print(error_code)
            print(error_log)
    elif train_or_load == 'load':
        path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model_pre_proc.pkl')
        m_pre_proc = joblib.load(path)
        path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model.h5')
        model = load_model(path)
        return m_pre_proc, model
    else:
        raise ValueError('train_or_load should be either "train" or "load"')
    return None
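# A usage sketch of the train/load round trip above, assuming a Data object built as in
# the scoring block below; predict_classes is the Keras 1.x Sequential API. Kept
# commented out so importing this module stays side-effect free.
# am_glove_fasttext_CI_DRM_IDC_Target_noTP(data, train_or_load = 'train')
# m_pre_proc, model = am_glove_fasttext_CI_DRM_IDC_Target_noTP(data, train_or_load = 'load')
# new_text_num = m_pre_proc.transform(data.df[data.fs_ind].values)
# predictions = model.predict_classes(new_text_num)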
n_output_hidden_units = 38
cv_n_fold = 10
cv_n_fold_dl = 0

# Set Base Path----------------------------------------------------------------
os.chdir(path_base)
sys.path.append(path_base)
from q_scripts.a_class_func_dir import DirStructure, Job, Data

# Read Config File: -----------------------------------------------------------
fld = DirStructure('config.ini')
data = Data(pd_or_np='pd', fl_submission_input=None)

# Load the persisted pre-processing pipeline and model saved by the training job.
job = Job('CI_DRM_IDC_Target_noTP', cv=cv_n_fold_dl, n_threads=1, save_model_tf=True,
          model_package=None)
path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model_pre_proc.pkl')
m_pre_proc = joblib.load(path)
path = fld.get_path(fld.model_scoring, job.job_name, 'model', 'model.h5')
model = load_model(path)

def checkData(obs):
    # Checks the data after preprocessing and returns the status.
    obs = "".join(obs)
    if obs.strip() == "":
        status = {
            'Error': [{
                'value':
def ba_GBM(data, name = ''):
    # Quick GBM baseline; the name suffix distinguishes runs on different feature sets.
    job = Job('ba_GBM_' + name, cv = cv_n_fold)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [2])
    job.run(pipeline, parameters, data)
    return None
def ed_GBM(data):
    # Ensemble-stage job: GBM over base-model predictions.
    job = Job('ed_GBM', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    parameters = dict(gbm__n_estimators = [30, 100, 300])
    job.run(pipeline, parameters, data)
    return None
def ea_regression(data):
    # Ensemble-stage job: linear SGD classifier over base-model predictions.
    job = Job('ea_regression', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('reg', SGDClassifier())])
    parameters = dict(reg__alpha = [0.0001])
    job.run(pipeline, parameters, data)
    return None
def eb_randomForest(data):
    # Ensemble-stage job: random forest over base-model predictions.
    job = Job('eb_randomForest', cv = cv_n_fold, ensamble_model_job = True)
    pipeline = Pipeline(steps=[('rf', RandomForestClassifier())])
    parameters = dict(rf__n_estimators = [3, 9, 27])
    job.run(pipeline, parameters, data)
    return None