def am_glove_fasttext(data):
    """Train a GloVe-initialised fastText-style Keras model on the text column."""
    n_vocab = 40000
    seq_len = 700
    emb_dim = 300

    job = Job('am_glove_fasttext', cv=cv_n_fold_dl, n_threads=1,
              model_package='keras')

    # The Keras build_fn reads the embedding matrix from module scope,
    # so publish it there before the job runs.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind], max_features=n_vocab, embedding_dims=emb_dim)

    net = KerasClassifier(build_fn=glove_fasttext, batch_size=256,
                          validation_split=0.1, nb_epoch=200, verbose=1)
    pipe = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('m', net),
    ])
    grid = {
        'txt_to_seq__n_max_features': [n_vocab],
        'padd_seq__max_seq_len': [seq_len],
        'm__max_features': [n_vocab],
        'm__max_seq_len': [seq_len],
        'm__embedding_dims': [emb_dim],
    }
    job.run(pipe, grid, data)
    return None
def ec_AdaBoost_Extratree(data):
    """Ensemble-stage job: AdaBoost over an ExtraTrees regressor base estimator."""
    job = Job('ec_AdaBoost_Extratree', cv=cv_n_fold, ensamble_model_job=True)
    pipe = Pipeline(steps=[
        ('ada_extraTree', AdaBoostRegressor(ExtraTreesRegressor())),
    ])
    grid = {'ada_extraTree__base_estimator__n_estimators': [10]}
    job.run(pipe, grid, data)
    return None
def ae_tfidf_BayesianRidge(data):
    """TF-IDF features fed into a Bayesian ridge regression."""
    job = Job('ae_tfidf_BayesianRidge')
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('br', BayesianRidge()),
    ])
    grid = {
        'tfidf__norm': ['l2'],
        'tfidf__ngram_range': [(1, 2)],
    }
    job.run(pipe, grid, data)
    return None
def af_vecAvg_MaxEnt_OutputCode(data):
    """Averaged word vectors into an output-code-wrapped logistic regression."""
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv=cv_n_fold)
    vec_avg = Word2VecTransformer(
        fld.get_path(fld.model_meta_data, fl_word_vectors),
        dim=300,
        all_text_data=list(data.df[data.fs_ind]))
    pipe = Pipeline(steps=[
        ('vecAvg', vec_avg),
        ('m', OutputCodeClassifier(LogisticRegression(), code_size=10)),
    ])
    grid = {'m__estimator__C': [0.01]}
    job.run(pipe, grid, data)
    return None
def ab_tfidf_elasticnet(data):
    """TF-IDF + SGD classifier with elastic-net penalty; small hyper-parameter grid."""
    job = Job('ab_tfidf_elasticnet', cv=cv_n_fold)
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('elnet', SGDClassifier(penalty='elasticnet')),
    ])
    grid = {
        'tfidf__norm': ['l2'],
        'tfidf__ngram_range': [(1, 2), (1, 3)],
        # wider sweep previously tried: alpha in [1e-5 .. 1e-1]
        'elnet__alpha': [1e-5, 1e-4, 1e-3, 1e-2],
        'elnet__l1_ratio': [0.1, 0.5, 0.8, 0.9, 0.99],
    }
    job.run(pipe, grid, data)
    return None
def ac_truncSVD_GBM(data):
    """Gradient boosting on truncated-SVD features of TF-IDF vectors."""
    job = Job('ac_truncSVD_GBM', cv=cv_n_fold)
    # Deep copy so the SVD feature extraction never mutates the caller's data.
    svd_data = copy.deepcopy(data)
    # n_components = 2000 --> variance explained =
    svd_data = get_tfidf_truncSVD_features(
        svd_data, fs_text=data.fs_ind, ngram_range=(1, 2),
        n_components=2000, verbose=1)
    pipe = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    grid = {'gbm__n_estimators': [100, 300, 500]}
    job.run(pipe, grid, svd_data)
    return None
def aa_tfidf_MaxEnt(data):
    """Baseline: capped TF-IDF vocabulary + logistic regression with a C sweep."""
    job = Job('aa_tfidf_MaxEnt', cv=cv_n_fold)
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=2000,
                                  min_df=5)),
        ('m', LogisticRegression()),
    ])
    grid = {
        'tfidf__norm': ['l2'],
        'tfidf__ngram_range': [(1, 2)],
        'm__C': [0.001, 0.01, 0.1, 1, 10],
    }
    job.run(pipe, grid, data)
    return None
def ak_embedding_cnn_lstm(data):
    """Learned-embedding CNN+LSTM classifier over padded token sequences."""
    job = Job('ak_embedding_cnn_lstm', cv=cv_n_fold_dl, n_threads=1)
    net = KerasClassifier(build_fn=cnn_lstm, batch_size=32, nb_epoch=10,
                          validation_split=0.1, verbose=1)
    pipe = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('cnn_lstm', net),
    ])
    grid = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'cnn_lstm__embedding_dims': [50],
    }
    job.run(pipe, grid, data)
    return None
def ag_vecAvg_randomForest(data):
    """Averaged word vectors + random forest; sweeps the forest size."""
    job = Job('ag_vecAvg_randomForest', cv=cv_n_fold)
    vec_avg = Word2VecTransformer(fl_word_vectors_zip, fl_word_vectors, dim=300,
                                  all_text_data=list(data.df[data.fs_ind]))
    pipe = Pipeline(steps=[
        ('vecAvg', vec_avg),
        ('rf', RandomForestClassifier()),
    ])
    grid = {'rf__n_estimators': [30, 90, 270]}
    job.run(pipe, grid, data)
    return None
def af_vecAvg_MaxEnt(data):
    """Averaged word vectors + logistic regression with a C sweep."""
    job = Job('af_vecAvg_MaxEnt', cv=cv_n_fold)
    vec_avg = Word2VecTransformer(fl_word_vectors_zip, fl_word_vectors, dim=300,
                                  all_text_data=list(data.df[data.fs_ind]))
    pipe = Pipeline(steps=[
        ('vecAvg', vec_avg),
        ('m', LogisticRegression()),
    ])
    grid = {'m__C': [0.001, 0.01, 0.1, 1, 10]}
    job.run(pipe, grid, data)
    return None
def aa_tfidf_MaxEnt_OutputCode(data):
    """Capped TF-IDF vocabulary + output-code-wrapped logistic regression."""
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv=cv_n_fold)
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=2000,
                                  min_df=5)),
        ('m', OutputCodeClassifier(LogisticRegression(), code_size=10)),
    ])
    grid = {
        'tfidf__norm': ['l2'],
        'tfidf__ngram_range': [(1, 2)],
        'm__estimator__C': [0.01],
    }
    job.run(pipe, grid, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
    """TF-IDF + output-code ensemble of elastic-net SGD classifiers."""
    job = Job('ab_tfidf_elasticnet_OutputCode', cv=cv_n_fold)
    base = SGDClassifier(penalty='elasticnet')
    pipe = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english', min_df=5)),
        ('elnet', OutputCodeClassifier(base, code_size=100)),
    ])
    grid = {
        'tfidf__norm': ['l2'],
        'tfidf__ngram_range': [(1, 2)],            # wider sweep tried: (1, 3)
        'elnet__estimator__alpha': [0.0001],       # wider: [1e-5 .. 1e-1]
        'elnet__estimator__l1_ratio': [0.1],       # wider: [0.1 .. 0.99]
    }
    job.run(pipe, grid, data)
    return None
def ae_truncSVD_randomForest(data):
    """Random forest on truncated-SVD features of TF-IDF vectors.

    Deep-copies *data* so the SVD feature extraction does not mutate the
    caller's object, then grid-searches the forest hyper-parameters.
    """
    svd_data = copy.deepcopy(data)
    # n_components = 2000 --> variance explained =
    svd_data = get_tfidf_truncSVD_features(
        svd_data, fs_text=data.fs_ind, ngram_range=(1, 2),
        n_components=2000, verbose=1)
    # FIX: the job was labelled 'ad_truncSVD_randomForest', which mislabelled
    # its results/artifacts; align the Job name with this function's name.
    job = Job('ae_truncSVD_randomForest', cv=cv_n_fold)
    pipe = Pipeline(steps=[('rf', RandomForestClassifier())])
    grid = {
        'rf__n_estimators': [10, 30, 90, 270, 810],
        'rf__max_features': [60, 80, 'auto', 120, 140],
        'rf__max_depth': [5, 10, 15, None],
        'rf__min_samples_split': [2, 5, 10],
    }
    job.run(pipe, grid, svd_data)
    return None
def aj_embedding_fasttext(data):
    """Learned-embedding fastText-style Keras classifier."""
    job = Job('aj_embedding_fasttext', cv=cv_n_fold_dl, n_threads=1)
    net = KerasClassifier(build_fn=fasttext, batch_size=32, nb_epoch=5,
                          validation_split=0.1, verbose=1)
    # TODO: add ngram features based on the paper
    pipe = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('ft', net),
    ])
    grid = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'ft__max_seq_len': [300],
        'ft__embedding_dims': [100],
    }
    job.run(pipe, grid, data)
    return None
def al_glove_cnn_lstm(data):
    """GloVe-initialised CNN+LSTM Keras classifier."""
    job = Job('al_glove_cnn_lstm', cv=cv_n_fold_dl, n_threads=1)

    # The Keras build_fn reads the embedding matrix from module scope,
    # so publish it there before the job runs.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind], max_features=20000, embedding_dims=300)

    net = KerasClassifier(build_fn=glove_cnn_lstm, batch_size=64, nb_epoch=10,
                          validation_split=0.1, verbose=1)
    pipe = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('g_c_l', net),
    ])
    grid = {
        'txt_to_seq__n_max_features': [20000],
        'padd_seq__max_seq_len': [300],
        'g_c_l__max_seq_len': [300],
        'g_c_l__embedding_dims': [300],
    }
    job.run(pipe, grid, data)
    return None
def ao_multi_fltr_glove_cnn(data):
    """Multi-filter-width CNN over GloVe-initialised embeddings."""
    n_vocab = 20000
    seq_len = 300
    emb_dim = 300

    job = Job('ao_multi_fltr_glove_cnn', cv=cv_n_fold_dl, n_threads=1)

    # The Keras build_fn reads the embedding matrix from module scope,
    # so publish it there before the job runs.
    global embedding_matrix
    embedding_matrix = create_embedding_matrix(
        data.df[data.fs_ind], max_features=n_vocab, embedding_dims=emb_dim)

    net = KerasClassifier(build_fn=multi_fltr_glove_cnn, batch_size=64,
                          nb_epoch=10, validation_split=0.1, verbose=1)
    pipe = Pipeline(steps=[
        ('txt_to_seq', TextToNumericSequence()),
        ('padd_seq', PadNumericSequence()),
        ('m', net),
    ])
    grid = {
        'txt_to_seq__n_max_features': [n_vocab],
        'padd_seq__max_seq_len': [seq_len],
        'm__max_features': [n_vocab],
        'm__max_seq_len': [seq_len],
        'm__embedding_dims': [emb_dim],
    }
    job.run(pipe, grid, data)
    return None
def ed_GBM(data):
    """Ensemble-stage job: gradient boosting with an n_estimators sweep."""
    job = Job('ed_GBM', cv=cv_n_fold, ensamble_model_job=True)
    pipe = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    grid = {'gbm__n_estimators': [30, 100, 300]}
    job.run(pipe, grid, data)
    return None
def eb_randomForest(data):
    """Ensemble-stage job: small random forest with an n_estimators sweep."""
    job = Job('eb_randomForest', cv=cv_n_fold, ensamble_model_job=True)
    pipe = Pipeline(steps=[('rf', RandomForestClassifier())])
    grid = {'rf__n_estimators': [3, 9, 27]}
    job.run(pipe, grid, data)
    return None
def ea_regression(data):
    """Ensemble-stage job: linear SGD classifier with its default penalty."""
    job = Job('ea_regression', cv=cv_n_fold, ensamble_model_job=True)
    pipe = Pipeline(steps=[('reg', SGDClassifier())])
    grid = {'reg__alpha': [0.0001]}
    job.run(pipe, grid, data)
    return None
def ba_GBM(data, name=''):
    """Gradient boosting job; *name* is appended to the job identifier."""
    job = Job('ba_GBM_' + name, cv=cv_n_fold)
    pipe = Pipeline(steps=[('gbm', GradientBoostingClassifier())])
    grid = {'gbm__n_estimators': [2]}
    job.run(pipe, grid, data)
    return None