def split_data(df):
    """Split ``df`` into train/test feature and label sets.

    The label columns come from ``get_label_cols()``; the split size, seed
    and stratification flag are read from ``global_parameters.yaml``.

    Args:
        df: Frame holding both the feature columns and the label columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    config_file = 'global_parameters.yaml'
    label_cols = get_label_cols()

    features = df.drop(label_cols, axis=1)
    target = df[label_cols]

    seed = get_config(config_file, 'random_state')
    holdout = get_config(config_file, 'test_size')
    use_stratify = get_config(config_file, 'stratify')

    split_kwargs = {'test_size': holdout, 'random_state': seed}
    if use_stratify:
        # Only pass ``stratify`` when it is enabled in the configuration.
        split_kwargs['stratify'] = target

    X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                        **split_kwargs)
    return X_train, X_test, y_train, y_test
Ejemplo n.º 2
0
def model_param_search(search_type, model_name):
    """Build a hyper-parameter search object for ``model_name``.

    All settings are read from ``global_parameters.yaml``. The search class
    is looked up by name on ``model_selection`` (presumably
    ``sklearn.model_selection`` -- confirm against the file's imports).

    ``random_state`` and ``n_iter`` are forwarded only when the selected
    search class accepts them. Passing them unconditionally makes
    constructors that do not declare them (e.g. an exhaustive grid search)
    fail with ``TypeError``.

    Args:
        search_type: Attribute name of the search class on
            ``model_selection``.
        model_name: Identifier handed to ``model_param_selector`` and
            ``get_model``.

    Returns:
        The object created by calling the selected search class.
    """
    import inspect

    param_path = 'global_parameters.yaml'

    cv_name = get_config(param_path, 'cv')
    cv_n_split = get_config(param_path, 'cv_n_split')
    cv_n_repeats = get_config(param_path, 'cv_n_repeats')
    n_jobs = get_config(param_path, 'n_jobs')
    verbose = get_config(param_path, 'verbose')
    rnd_src_n_iter = get_config(param_path, 'rnd_src_n_iter')
    random_state = get_config(param_path, 'random_state')

    params = model_param_selector(model_name)
    model = get_model(model_name)

    cv = select_cv(cv_name, cv_n_split, cv_n_repeats, random_state)

    search_cls = getattr(model_selection, search_type)

    # Arguments every search class is expected to take.
    search_kwargs = {'cv': cv, 'verbose': verbose, 'n_jobs': n_jobs}
    # Arguments that only some search classes declare.
    optional_kwargs = {'random_state': random_state,
                       'n_iter': rnd_src_n_iter}

    try:
        accepted = inspect.signature(search_cls).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: keep the previous behaviour and
        # forward everything.
        accepted = None

    forward_all = accepted is None or any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in accepted.values())
    for name, value in optional_kwargs.items():
        if forward_all or name in accepted:
            search_kwargs[name] = value

    return search_cls(model, params, **search_kwargs)


#def model_train()
def save_to_mongo(estimator, score_func, score, test_score, created_on):
    """Insert one run-log document and return the ``insert_one`` result.

    The target is resolved from ``project_information.yaml``: the attribute
    named ``project_name`` on the connection, then the attribute named
    ``col_name`` on that object (presumably database and collection).

    Args:
        estimator: Value stored under ``'estimator'``.
        score_func: Value stored under ``'score_func'``.
        score: Value stored under ``'score'``.
        test_score: Value stored under ``'test_score'``.
        created_on: Value stored under ``'created_on'``.

    Returns:
        Whatever ``insert_one`` returns for the logged document.
    """
    info_file = 'project_information.yaml'

    client = Connect.get_connection()
    project_name = get_config(info_file, 'project_name')
    project_type = get_config(info_file, 'project_type')
    collection_name = get_config(info_file, 'col_name')

    collection = getattr(getattr(client, project_name), collection_name)

    record = {
        'project_name': project_name,
        'project_type': project_type,
        'estimator': estimator,
        'score_func': score_func,
        'score': score,
        'test_score': test_score,
        'created_on': created_on,
    }
    return collection.insert_one(record)
Ejemplo n.º 4
0
from src.data.categorize_data import get_cat_cols
from src.data.categorize_data import get_drop_cols
from src.data.load_data import separated_initial_data_load
from src.utils.get_config import get_config


# Print the configured categorical and droppable column lists.
print(get_cat_cols())
print(get_drop_cols())

# Resolve both input files from the path configuration.
train_values_path = get_config('file_paths.yaml', 'train_values')
train_labels_path = get_config('file_paths.yaml', 'train_labels')
# The loader takes the values path first and the labels path second.
print(separated_initial_data_load(train_values_path, train_labels_path))
from src.utils.save_to_mongo import save_to_mongo
from src.utils.calc_score import calc_score
from src.utils.get_sample import sampling
from src.data.load_data import separated_initial_data_load
from src.data.categorize_data import get_cat_cols
from src.features.drop_cols import drop_cols
from src.features.split_data import split_data
from src.features.encoding import select_encoder
from src.features.scaling import select_scaler
from src.features.feature_selector import feature_selector
from src.features.fitting import fitting
from src.features.transforming import transforming
from src.features.decompositioning import decompositioning
from src.models.train_model import model_param_search

# Input file locations.
train_values_path, train_labels_path = (
    get_config('file_paths.yaml', key)
    for key in ('train_values', 'train_labels')
)

# Pipeline component choices, all read from the global parameter file.
(
    scoring_name,
    encoder_name,
    scaler_name,
    feature_selector_name,
    model_name,
    search_type,
) = (
    get_config('global_parameters.yaml', key)
    for key in ('scoring_func', 'encoder', 'scaler',
                'feature_selector', 'model', 'search_type')
)
cat_cols = get_cat_cols()

# preparation: load the raw data, drop configured columns, then sample.
df = separated_initial_data_load(train_values_path, train_labels_path)
df_dropped = drop_cols(df)
sample_data = sampling(df_dropped)
Ejemplo n.º 6
0
def get_drop_cols():
    """Return the ``drop_cols`` entry of ``data_categorization.yaml``."""
    columns = get_config('data_categorization.yaml', 'drop_cols')
    return columns
Ejemplo n.º 7
0
def get_label_cols():
    """Return the ``label_cols`` entry of ``data_categorization.yaml``."""
    columns = get_config('data_categorization.yaml', 'label_cols')
    return columns
def sampling(df):
    """Return a random sample of ``df``.

    The number of rows (``sample_data_size``) and the seed
    (``random_state``) are read from ``global_parameters.yaml``.

    Args:
        df: Object exposing a ``sample(n=..., random_state=...)`` method.

    Returns:
        The result of ``df.sample`` with the configured size and seed.
    """
    params_file = 'global_parameters.yaml'
    seed = get_config(params_file, 'random_state')
    n_rows = get_config(params_file, 'sample_data_size')
    return df.sample(random_state=seed, n=n_rows)
from sklearn.tree import DecisionTreeClassifier

from src.utils.get_config import get_config

# Seed read once when this module is imported; get_model passes it to the
# estimators it builds.
random_state = get_config("global_parameters.yaml", "random_state")


def get_model(model_name):
    """Return an estimator instance for ``model_name``.

    Only ``'dtc'`` (a ``DecisionTreeClassifier`` seeded with the
    module-level ``random_state``) is supported. For any other name a
    message is printed and ``None`` is returned.

    Args:
        model_name: Short identifier of the model to build.

    Returns:
        A ``DecisionTreeClassifier`` for ``'dtc'``, otherwise ``None``.
    """
    if model_name != 'dtc':
        # TODO other models should be added
        print("Model couldn't find.")
        return None

    # TODO global variable needs to change
    return DecisionTreeClassifier(random_state=random_state)