def get_testing_metrics(model, X, y, metrics, as_indexes, n_folds, X_test=None): y_pred = cross_val_predict( model, X, y, cv=StratifiedKFold( y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE ) ) print "y_pred", y_pred model.fit(X, y) result = get_y_true_y_pred_based_metrics(y, y_pred, metrics) if FEATURES in metrics: result[FEATURES] = model.get_support(indices=True) if OBJECTS in metrics: if as_indexes: result[OBJECTS] = [get_data_keeper().get_object_name_by_index(index) for (index,) in X] else: result[OBJECTS] = list(X.index) if TEST_PREDICTIONS in metrics: result[TEST_PREDICTIONS] = X_test, model.predict(X_test) return result
def make_new_generator(): start_time = time.time() X = get_data_keeper().get_common_x() print "matrix shape before:", X.shape # Матрица X = наша матрица мутаций (snps) X[X != 1] = 0 to_drop = (X.sum(axis=0) >= (X.shape[0] / 2)) | ( X.sum(axis=0) < 3 ) # Убираем столбцы (=позиции мутаций), где слищком малое число мутированных образцов to_drop = to_drop[to_drop].index X = X.drop(to_drop, axis=1) # save filtered SNPs matrix (add saving to temp directory) X.to_csv(RAW_X_BEFORE_SUBSET_GENERATION_PATH) print "matrix shape after:", X.shape # оставили только слолбцы, где более 3-х мутированных образцов sys.stdout.flush() # реализовать, чтобы генерация подмножеств не запускалась, если этого не требуется generator = SubsetGenerator() generator.generate_and_set(X.as_matrix().astype( np.uint8)) # генерируем набор подмножеств (запускаем модуль на Си++) print "generating done, time from start spent:", time.time() - start_time # временный файл для хранения сгенерированных подмножеств: нужно чтобы это было в каталоге $HOME/gwas/tmp generator.store(POSSIBLE_COMPLEX_FEATURES_PATH) print "storing done, time from start spent:", time.time() - start_time return generator, X
def run_experiment(
        params,
        experiment_name,
        drug,
        max_evals=100,
        as_indexes=True
):
    """Run a cross-validated metamodel experiment for ``drug``, one process per fold.

    A ``MetamodelFactory`` is built from ``params`` together with a
    ``ResultsDumper`` named after the experiment/drug pair and a
    ``MetricsGetter`` that scores ``ALL_METRICS`` with an
    ``AccuracyLossGetter`` over 5 inner folds.  The training data for ``drug``
    is then split with a shuffled ``StratifiedKFold`` (seeded with
    ``RANDOM_STATE``) and ``run_experiment_fold`` is started in a separate
    ``Process`` for every split.  The call blocks until all of them finish.

    The number of outer folds is 5, or 10 when fewer than 50 labels are
    available.  That value used to be computed and then ignored because the
    split hard-coded ``n_folds=5``; it is now actually passed to
    ``StratifiedKFold``.

    Parameters:
        params: passed to ``MetamodelFactory`` as ``metamodel_structure``.
        experiment_name: combined with ``drug`` by
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: the drug whose training data is loaded from the data keeper.
        max_evals: forwarded to ``MetamodelFactory``.
        as_indexes: forwarded to both ``MetricsGetter`` and
            ``get_train_data``.

    Returns None.
    """
    experiment_name_for_drug = get_experiment_name_for_drug(experiment_name, drug)
    results_dumper = ResultsDumper(experiment_name_for_drug)
    loss_getter = AccuracyLossGetter()
    inner_metrics_getter = MetricsGetter(
        metrics=ALL_METRICS,
        as_indexes=as_indexes,
        loss_func=loss_getter,
        n_folds=5,
    )
    model = MetamodelFactory(
        metamodel_structure=params,
        feature_selector=None,
        results_dumper=results_dumper,
        metrics_getter=inner_metrics_getter,
        max_evals=max_evals
    )

    X, y = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)
    # Small data sets are split into more (hence larger-train) folds.
    n_folds = 10 if len(y) < 50 else 5

    init_common()

    outer_folds = StratifiedKFold(
        y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE
    )
    processes = []
    for fold_index, (train_index, test_index) in enumerate(outer_folds):
        process = Process(
            target=run_experiment_fold,
            args=(model, X, y, train_index, test_index, fold_index)
        )
        processes.append(process)
        process.start()

    # Wait for every fold before returning.
    for process in processes:
        process.join()
def make_new_generator(): start_time = time.time() X = get_data_keeper().get_common_x() print "matrix shape before:", X.shape X[X!=1] = 0 to_drop = (X.sum(axis=0) >= (X.shape[0] / 2)) | (X.sum(axis=0) < 3) to_drop = to_drop[to_drop].index X = X.drop(to_drop, axis=1) X.to_csv(RAW_X_BEFORE_SUBSET_GENERATION_PATH) print "matrix shape after:", X.shape sys.stdout.flush() generator = SubsetGenerator() generator.generate_and_set(X.as_matrix().astype(np.uint8)) print "generating done, time from start spent:", time.time() - start_time generator.store(POSSIBLE_COMPLEX_FEATURES_PATH) print "storing done, time from start spent:", time.time() - start_time return generator, X
def run_experiment(params, experiment_name, drug):
    """Fit the model given as ``params`` on ``drug`` and dump its feature sets.

    ``params`` is used directly as the model: it is fitted with the training
    indexes and labels of ``drug`` (loaded with ``as_indexes=True``), and the
    ``_result_feature_sets`` of its ``extender_strategy`` are saved through a
    ``ResultsDumper`` whose sub-directory is set to ``"0"``.

    Parameters:
        params: an object exposing ``fit(indexes=..., y=...)`` and an
            ``extender_strategy`` attribute.
        experiment_name: combined with ``drug`` by
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: the drug whose training data is loaded from the data keeper.

    Returns None.
    """
    dumper = ResultsDumper(get_experiment_name_for_drug(experiment_name, drug))
    dumper.set_subdir(str(0))

    indexes, labels = get_data_keeper().get_train_data(drug, as_indexes=True)
    init_common()

    model = params
    model.fit(indexes=indexes, y=labels)

    # Persist the feature sets found by the model's extender strategy.
    feature_sets = model.extender_strategy._result_feature_sets
    dumper.save_tuple(feature_sets)
def run_experiment(
        params,
        experiment_name,
        drug,
        max_evals=100,
        as_indexes=True):
    """Run a cross-validated hyper-parameter search for ``drug``, one process per fold.

    A ``HyperParameterSearcher`` is built from ``params`` together with a
    ``ResultsDumper`` named after the experiment/drug pair and a
    ``MetricsGetter`` that scores ``ALL_METRICS`` with an
    ``AccuracyLossGetter`` over 5 inner folds.  The training data for ``drug``
    is then split with a shuffled ``StratifiedKFold`` (seeded with
    ``RANDOM_STATE``) and ``run_experiment_fold`` is started in a separate
    ``Process`` for every split.  The call blocks until all of them finish.

    The number of outer folds is 5, or 10 when fewer than 50 labels are
    available.  That value used to be computed and then ignored because the
    split hard-coded ``n_folds=5``; it is now actually passed to
    ``StratifiedKFold``.

    Parameters:
        params: forwarded to ``HyperParameterSearcher`` as ``params``.
        experiment_name: combined with ``drug`` by
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: the drug whose training data is loaded from the data keeper.
        max_evals: forwarded to ``HyperParameterSearcher``.
        as_indexes: forwarded to both ``MetricsGetter`` and
            ``get_train_data``.

    Returns None.
    """
    experiment_name_for_drug = get_experiment_name_for_drug(experiment_name, drug)
    results_dumper = ResultsDumper(experiment_name_for_drug)
    loss_getter = AccuracyLossGetter()
    inner_metrics_getter = MetricsGetter(
        metrics=ALL_METRICS,
        as_indexes=as_indexes,
        loss_func=loss_getter,
        n_folds=5,
    )
    model = HyperParameterSearcher(
        params=params,
        results_dumper=results_dumper,
        metrics_getter=inner_metrics_getter,
        max_evals=max_evals,
    )

    X, y = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)
    # Small data sets are split into more (hence larger-train) folds.
    n_folds = 10 if len(y) < 50 else 5

    init_common()

    outer_folds = StratifiedKFold(
        y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE
    )
    processes = []
    for fold_index, (train_index, test_index) in enumerate(outer_folds):
        process = Process(
            target=run_experiment_fold,
            args=(model, X, y, train_index, test_index, fold_index)
        )
        processes.append(process)
        process.start()

    # Wait for every fold before returning.
    for process in processes:
        process.join()
target=run_experiment_fold, args=(model, X, y, train_index, test_index, i) ) processes.append(process) process.start() for process in processes: process.join() ######## def run_model(drug): params = get_linear_model_params() return run_experiment( params=params, experiment_name='model', drug=drug, as_indexes=True, #as_indexes=False, max_evals=MAX_EVALS, ) if __name__ == '__main__': #run_model(get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])]) run_model(get_data_keeper().get_possible_second_level_drugs()[int(2)]) #run_model(get_data_keeper().get_possible_first_level_drugs()[int(2)])
'feature': np.asarray(model_features[feature_indices]) }) df.to_csv( join(model.results_dumper.get_root_folder(), "final_model_features{}.csv".format(fold_index))) #model.results_dumper.plot_metrics_progress(metrics=PLOT_METRICS) print "Best hyperparams: ", model.get_hyperparams(deep=True) # Run experiment if __name__ == '__main__': drug_name = get_data_keeper().get_possible_second_level_drugs()[2] experiment_name_for_drug = get_experiment_name_for_drug( "simple_logreg_experiment_", drug_name) # create model my_model = SequentialModel(name='simpleLR') my_model.add(layer=get_linear_model()) # create metamodel my_metamodel = SimpleFeaturesMetamodel() my_metamodel.configure_params(inner_model=my_model) my_metamodel.set_result_dumper(result_dumper=ResultsDumper( experiment_name=experiment_name_for_drug)) my_metamodel.set_metrics_getter(metrics_getter=MetricsGetter(
get_model_params, \
    get_complex_features_adder_wrapper, \
    get_nothing_doing_extender_strategy, \
    get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the FRN experiment.

    The result of ``get_model_params`` is wrapped by ``get_frn_params`` and
    then by ``scope.get_complex_features_adder_wrapper``, using the strategy
    returned by ``scope.get_nothing_doing_extender_strategy``.
    """
    frn_params = get_frn_params(get_model_params())
    return scope.get_complex_features_adder_wrapper(
        inner_model=frn_params,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_frn_model(drug):
    """Run the ``'frn_model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='frn_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_frn_model(second_level_drugs[int(sys.argv[1])])
# df.columns = [str(result.inner_model), 'pos'] df.to_csv(join('/home/roma/tb_gwas_experiments/experiments_results', "final_model_features{}.csv".format(fold_index))) #model.results_dumper.dump_final_result(model._result_model, model._result_metrics) ### calculate and plot model performance metrics ### read all files with models, read metrics for train data, calculate and plot metrics for test data model.results_dumper.plot_all_metrics() # Run experiment if __name__ == '__main__': drug_name = get_data_keeper().get_possible_second_level_drugs()[2] experiment_name_for_drug = get_experiment_name_for_drug("simple_logreg_model", drug_name) # create metamodels to test metamodelLR = SimpleMetamodel(experiment_name=experiment_name_for_drug) # load data - in X it will return INDEXES of points for which y exists X, y = get_data_keeper().get_train_data(drug_name, as_indexes=False) n_folds = 5 if len(y) < 50: n_folds = 10 #init_common()
get_model_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the feature-selector experiment.

    The result of ``get_model_params`` is wrapped by
    ``get_feature_selector_params`` and then by
    ``scope.get_complex_features_adder_wrapper``, using the strategy returned
    by ``scope.get_nothing_doing_extender_strategy``.
    """
    selector_params = get_feature_selector_params(
        inner_model_params=get_model_params(),
    )
    return scope.get_complex_features_adder_wrapper(
        inner_model=selector_params,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_selector_model(drug):
    """Run the ``'selector_model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='selector_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_selector_model(second_level_drugs[int(sys.argv[1])])
    # Fixed-index variant kept from the original:
    #run_selector_model(get_data_keeper().get_possible_second_level_drugs()[int(2)])
get_model_params, \
    get_complex_features_adder_wrapper, \
    get_nothing_doing_extender_strategy, \
    get_frn_params, \
    get_boruta_feature_selector_params, \
    get_feature_selector_estimator_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the extender-robust experiment.

    The result of ``get_feature_selector_estimator_params`` is wrapped by
    ``get_simple_feature_adder_wrapper_params``.
    """
    estimator_params = get_feature_selector_estimator_params()
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=estimator_params,
    )


def run_extender_robust_model(drug):
    """Run the ``'extender_robust_model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='extender_robust_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_robust_model(second_level_drugs[int(sys.argv[1])])
get_model_params, \
    get_complex_features_adder_wrapper, \
    get_nothing_doing_extender_strategy, \
    get_frn_params, \
    get_boruta_feature_selector_params, \
    get_feature_selector_estimator_params
from common import MAX_EVALS


def get_all_params():
    """Return the parameters used by the extender-robust experiment.

    ``get_feature_selector_estimator_params`` supplies the inner parameters,
    which ``get_simple_feature_adder_wrapper_params`` then wraps.
    """
    wrapped = get_simple_feature_adder_wrapper_params(
        inner_model_params=get_feature_selector_estimator_params(),
    )
    return wrapped


def run_extender_robust_model(drug):
    """Launch ``run_experiment`` under the name ``'extender_robust_model'``.

    ``drug`` is forwarded unchanged; ``as_indexes`` is True and ``max_evals``
    is ``MAX_EVALS``.  The result of ``run_experiment`` is returned.
    """
    experiment_params = get_all_params()
    return run_experiment(
        params=experiment_params,
        experiment_name='extender_robust_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # sys.argv[1] selects the drug by position among the second-level drugs.
    candidate_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_robust_model(candidate_drugs[int(sys.argv[1])])
def test_model_with_drug(model, drug, metrics, as_indexes, n_folds=10):
    """Evaluate ``model`` on the training data of a single drug.

    The (X, y) pair for ``drug`` is loaded from the data keeper, with
    ``as_indexes`` passed through, and handed to ``get_testing_metrics``
    along with ``metrics``, ``as_indexes`` and ``n_folds``.  No ``X_test`` is
    supplied to that call.

    Returns whatever ``get_testing_metrics`` returns.
    """
    train_data = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)
    features, labels = train_data
    return get_testing_metrics(
        model, features, labels, metrics, as_indexes, n_folds
    )
import sys import time from multiprocessing import Process from data_keeper import get_data_keeper from run_experiment import init_common from run_model_experiment import run_model from run_selector_model_experiment import run_selector_model from run_frn_model_experiment import run_frn_model from run_extender_selector_model_experiment import run_extender_selector_model from run_extender_frn_model_experiment import run_extender_frn_model from run_extender_robust_model_experiment import run_extender_robust_model if __name__ == "__main__": #drug = get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])] drug = get_data_keeper().get_possible_second_level_drugs()[2] start_time = time.time() init_common( ) # сгенерировать файл с комбинациями признаков (размер файла >4 Гб!!) processes = list() processes.append(Process(target=run_model, args=(drug, ))) processes.append(Process(target=run_frn_model, args=(drug, ))) processes.append(Process(target=run_selector_model, args=(drug, ))) processes.append(Process(target=run_extender_frn_model, args=(drug, ))) processes.append(Process(target=run_extender_selector_model, args=(drug, ))) processes.append(Process(target=run_extender_robust_model, args=(drug, ))) for process in processes:
from hyperparameter_search import get_simple_feature_adder_wrapper_params,\
    get_feature_selector_params, \
    get_model_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the extender-selector experiment.

    The result of ``get_model_params`` is wrapped by
    ``get_feature_selector_params`` and then by
    ``get_simple_feature_adder_wrapper_params``.
    """
    # Per the original notes: the model parameters choose between RF, XGB or
    # logistic regression ...
    model_params = get_model_params()
    # ... and the selector parameters choose between feature-selection
    # models: Chi-squared or the k most important features of RF/XGB/LogRegr.
    selector_params = get_feature_selector_params(
        inner_model_params=model_params,
    )
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=selector_params,
    )


def run_extender_selector_model(drug):
    """Run the ``'extender_selector_model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='extender_selector_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # The drug is fixed to position 1 of the second-level drug list.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_selector_model(second_level_drugs[int(1)])
from data_keeper import get_data_keeper
from sklearn.model_selection import GridSearchCV
from testing import test_models_with_drugs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
get_data_keeper().get_possible_drugs()
import wrappers
from wrappers import GridSearchCVWrapper
from wrappers import XGBoostClassifierFeatureImportances as XGB
from wrappers import MatrixCleaningWrapper
from wrappers import SparseWrapper
from wrappers import ModelFeatureSelectionWrapper
from wrappers import ModelBasedFeatureImportanceGetter
from wrappers import AsMatrixWrapper
from frn import FeatureRelevanceNetworkWrapper


def get_complete_linear_model():
    """Build the wrapped logistic-regression model.

    A ``LogisticRegressionCV`` with the ``liblinear`` solver and ``Cs`` equal
    to ``10 ** i`` for ``i`` in ``-4 .. 3`` is placed inside a
    ``GridSearchCV`` over the ``l1``/``l2`` penalties; the search is wrapped
    in ``SparseWrapper`` and then ``MatrixCleaningWrapper``.
    """
    regularization_grid = [10 ** exponent for exponent in xrange(-4, 4)]
    logreg = LogisticRegressionCV(Cs=regularization_grid, solver='liblinear')
    penalty_search = GridSearchCV(logreg, {'penalty': ['l1', 'l2']})
    return MatrixCleaningWrapper(SparseWrapper(penalty_search))


def get_complete_tree_based_model():
    """Build the wrapped XGBoost-based feature-relevance-network model.

    A ``FeatureRelevanceNetworkWrapper`` is created from an ``XGB`` classifier
    with 100 estimators and a ``ModelBasedFeatureImportanceGetter`` around a
    default ``XGB``; the result is wrapped in ``MatrixCleaningWrapper``.
    """
    # NOTE(review): this grid is built but never used below -- confirm
    # whether a grid search over it was intended.  The wider candidate lists
    # from the original are kept in the trailing comments.
    cv_params = {
        'inner_model__inner_model__n_estimators': [1],  #, 5, 10, 20, 50, 100],
        'feature_selection_threshold_coef': [0.1],  #, 1, 3, 10, 30, 100, 300]}
    }
    classifier = XGB(n_estimators=100)
    importance_getter = ModelBasedFeatureImportanceGetter(XGB())
    relevance_network = FeatureRelevanceNetworkWrapper(
        classifier, importance_getter
    )
    return MatrixCleaningWrapper(relevance_network)
get_feature_selector_params, \
    get_model_params, \
    get_complex_features_adder_wrapper, \
    get_nothing_doing_extender_strategy, \
    get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the extender-FRN experiment.

    The result of ``get_model_params`` is wrapped by ``get_frn_params`` and
    then by ``get_simple_feature_adder_wrapper_params``.
    """
    frn_params = get_frn_params(get_model_params())
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=frn_params,
    )


def run_extender_frn_model(drug):
    """Run the ``'extender_frn_model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='extender_frn_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # The drug is fixed to position 4 of the second-level drug list.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_frn_model(second_level_drugs[int(4)])
get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Assemble the parameter structure of the plain-model experiment.

    The result of ``get_model_params`` is wrapped by
    ``scope.get_complex_features_adder_wrapper``, using the strategy returned
    by ``scope.get_nothing_doing_extender_strategy``.
    """
    model_params = get_model_params()
    return scope.get_complex_features_adder_wrapper(
        inner_model=model_params,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_model(drug):
    """Run the ``'model'`` experiment for ``drug``.

    Calls ``run_experiment`` with the parameters from ``get_all_params``,
    ``as_indexes=True`` and ``max_evals=MAX_EVALS`` and returns its result.
    """
    return run_experiment(
        params=get_all_params(),
        experiment_name='model',
        drug=drug,
        as_indexes=True,
        #as_indexes=False,
        max_evals=MAX_EVALS,
    )


if __name__ == '__main__':
    # Second-level variants kept from the original:
    #run_model(get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])])
    #run_model(get_data_keeper().get_possible_second_level_drugs()[int(4)])
    # The first command-line argument is the position of the drug in the
    # list of first-level drugs.
    first_level_drugs = get_data_keeper().get_possible_first_level_drugs()
    run_model(first_level_drugs[int(sys.argv[1])])
import sys import time from multiprocessing import Process from data_keeper import get_data_keeper from run_experiment import init_common from run_model_experiment import run_model from run_selector_model_experiment import run_selector_model from run_frn_model_experiment import run_frn_model from run_extender_selector_model_experiment import run_extender_selector_model from run_extender_frn_model_experiment import run_extender_frn_model from run_boruta_model_experiment import run_boruta_model from run_extender_robust_model_experiment import run_extender_robust_model if __name__ == "__main__": drug = get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])] init_common() processes = list() processes.append(Process(target=run_model, args=(drug,))) processes.append(Process(target=run_frn_model, args=(drug,))) processes.append(Process(target=run_selector_model, args=(drug,))) processes.append(Process(target=run_extender_frn_model, args=(drug,))) processes.append(Process(target=run_extender_selector_model, args=(drug,))) #processes.append(Process(target=run_boruta_model_experiment, args=(drug,))) processes.append(Process(target=run_extender_robust_model, args=(drug,))) for process in processes: process.start() for process in processes: process.join() print "done, ", time.time() - start_time