def split_data(df):
    """Split *df* into train/test feature and label sets.

    Reads ``test_size``, ``random_state`` and ``stratify`` from
    global_parameters.yaml; when ``stratify`` is truthy the split is
    stratified on the label column.
    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    label_col = get_label_cols()
    features = df.drop(label_col, axis=1)
    labels = df[label_col]

    split_kwargs = {
        'test_size': get_config('global_parameters.yaml', 'test_size'),
        'random_state': get_config('global_parameters.yaml', 'random_state'),
    }
    if get_config('global_parameters.yaml', 'stratify'):
        # Stratify on the labels themselves to preserve class balance.
        split_kwargs['stratify'] = labels

    return train_test_split(features, labels, **split_kwargs)
def model_param_search(search_type, model_name):
    """Build a hyper-parameter search object for *model_name*.

    Parameters
    ----------
    search_type : str
        Name of a class in ``sklearn.model_selection``
        (e.g. 'GridSearchCV' or 'RandomizedSearchCV').
    model_name : str
        Short model code understood by ``get_model`` / ``model_param_selector``.

    Returns
    -------
    An unfitted search object wrapping the model, its parameter grid and
    the configured cross-validation splitter.
    """
    param_path = 'global_parameters.yaml'
    cv_name = get_config(param_path, 'cv')
    cv_n_split = get_config(param_path, 'cv_n_split')
    cv_n_repeats = get_config(param_path, 'cv_n_repeats')
    n_jobs = get_config(param_path, 'n_jobs')
    verbose = get_config(param_path, 'verbose')
    rnd_src_n_iter = get_config(param_path, 'rnd_src_n_iter')
    random_state = get_config(param_path, 'random_state')

    params = model_param_selector(model_name)
    model = get_model(model_name)
    cv = select_cv(cv_name, cv_n_split, cv_n_repeats, random_state)

    search_cls = getattr(model_selection, search_type)
    search_kwargs = {'cv': cv, 'verbose': verbose, 'n_jobs': n_jobs}
    # BUG FIX: n_iter and random_state are RandomizedSearchCV-only arguments;
    # passing them unconditionally made GridSearchCV raise TypeError.
    if search_type == 'RandomizedSearchCV':
        search_kwargs['n_iter'] = rnd_src_n_iter
        search_kwargs['random_state'] = random_state
    return search_cls(model, params, **search_kwargs)
def save_to_mongo(estimator, score_func, score, test_score, created_on):
    """Insert one training-run log entry into the project's MongoDB collection.

    Connection, database and collection names are resolved from
    project_information.yaml. Returns the pymongo insert result.
    """
    connection = Connect.get_connection()

    info_path = 'project_information.yaml'
    project_name = get_config(info_path, 'project_name')
    project_type = get_config(info_path, 'project_type')
    col_name = get_config(info_path, 'col_name')

    # db and collection are resolved as attributes of the connection object.
    collection = getattr(getattr(connection, project_name), col_name)

    log_entry = {
        'project_name': project_name,
        'project_type': project_type,
        'estimator': estimator,
        'score_func': score_func,
        'score': score,
        'test_score': test_score,
        'created_on': created_on,
    }
    return collection.insert_one(log_entry)
from src.data.categorize_data import get_cat_cols
from src.data.categorize_data import get_drop_cols
from src.data.load_data import separated_initial_data_load
from src.utils.get_config import get_config

print(get_cat_cols())
print(get_drop_cols())

train_values_path = get_config('file_paths.yaml', 'train_values')
train_labels_path = get_config('file_paths.yaml', 'train_labels')
# BUG FIX: train_labels_path was passed as BOTH arguments; the first
# argument must be the values path, otherwise labels are loaded twice.
print(separated_initial_data_load(train_values_path, train_labels_path))
# BUG FIX: get_config was called below but never imported in this script.
from src.utils.get_config import get_config
from src.utils.save_to_mongo import save_to_mongo
from src.utils.calc_score import calc_score
from src.utils.get_sample import sampling
from src.data.load_data import separated_initial_data_load
from src.data.categorize_data import get_cat_cols
from src.features.drop_cols import drop_cols
from src.features.split_data import split_data
from src.features.encoding import select_encoder
from src.features.scaling import select_scaler
from src.features.feature_selector import feature_selector
from src.features.fitting import fitting
from src.features.transforming import transforming
from src.features.decompositioning import decompositioning
from src.models.train_model import model_param_search

# Resolve all run-time configuration up front.
train_values_path = get_config('file_paths.yaml', 'train_values')
train_labels_path = get_config('file_paths.yaml', 'train_labels')
scoring_name = get_config('global_parameters.yaml', 'scoring_func')
encoder_name = get_config('global_parameters.yaml', 'encoder')
scaler_name = get_config('global_parameters.yaml', 'scaler')
feature_selector_name = get_config('global_parameters.yaml', 'feature_selector')
model_name = get_config('global_parameters.yaml', 'model')
search_type = get_config('global_parameters.yaml', 'search_type')
cat_cols = get_cat_cols()

# preparation
df = separated_initial_data_load(train_values_path, train_labels_path)
df_dropped = drop_cols(df)
sample_data = sampling(df_dropped)
def get_drop_cols():
    """Return the columns configured to be dropped, per data_categorization.yaml."""
    config_file = 'data_categorization.yaml'
    return get_config(config_file, 'drop_cols')
def get_label_cols():
    """Return the label column(s) configured in data_categorization.yaml."""
    config_file = 'data_categorization.yaml'
    return get_config(config_file, 'label_cols')
def sampling(df):
    """Return a reproducible random sample of *df*.

    Sample size and random seed are read from global_parameters.yaml.
    """
    params_file = 'global_parameters.yaml'
    return df.sample(
        n=get_config(params_file, 'sample_data_size'),
        random_state=get_config(params_file, 'random_state'),
    )
from sklearn.tree import DecisionTreeClassifier

from src.utils.get_config import get_config

# TODO: module-level config read runs at import time; consider lazy loading
# (original note: "global variable needs to change").
random_state = get_config("global_parameters.yaml", "random_state")


def get_model(model_name):
    """Return an unfitted estimator for the short model code *model_name*.

    Parameters
    ----------
    model_name : str
        Currently only 'dtc' (DecisionTreeClassifier) is supported.
        # TODO other models should be added

    Raises
    ------
    ValueError
        If *model_name* is not a known model code.
    """
    if model_name == 'dtc':
        return DecisionTreeClassifier(random_state=random_state)
    # BUG FIX: previously printed a message and implicitly returned None,
    # deferring the failure to a confusing AttributeError in the caller.
    raise ValueError(f"Unknown model name: {model_name!r}")