def load_save(config_path):
    """Load the source data, clean it, and write it to the raw-data CSV.

    The ``id`` and ``Unnamed: 32`` columns are dropped, ``diagnosis`` is
    replaced by its dummy encoding (first category dropped), and the result
    is written to ``config["load_data"]["raw_data_csv"]``.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    frame = get_data(config_path)

    # Columns that must not end up in the saved dataset.
    frame = frame.drop(columns=['id', 'Unnamed: 32'])

    diagnosis_dummies = pd.get_dummies(frame['diagnosis'], drop_first=True)
    frame['diagnosis'] = diagnosis_dummies

    load_cfg = config["load_data"]
    frame.to_csv(
        load_cfg["raw_data_csv"],
        sep=',',
        index=False,
        encoding='utf-8',
    )
def train_and_evaluate(config_path):
    """Fit an ElasticNet, score it on the test split, and persist the results.

    Reads the train/test CSVs named in the config, fits the model, prints
    RMSE/MAE/R2, writes the scores and hyper-parameters as JSON reports, and
    dumps the fitted model to ``<model_dir>/model.joblib``.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    en_params = config["estimators"]["ElasticNet"]["params"]
    alpha = en_params["alpha"]
    l1_ratio = en_params["l1_ratio"]

    # The target is wrapped in a list, so the y selections below are
    # single-column DataFrames rather than Series.
    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y, test_y = train[target], test[target]
    train_x, test_x = train.drop(target, axis=1), test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # ---- JSON reports ---------------------------------------------------
    score_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(score_file, "w") as f:
        json.dump({"rmse": rmse, "mae": mae, "r2": r2}, f, indent=4)

    # NOTE(review): the key is spelled "l1_rate" (not "l1_ratio"); kept
    # as-is because report consumers may depend on it -- confirm.
    with open(params_file, "w") as f:
        json.dump({"alpha": alpha, "l1_rate": l1_ratio}, f, indent=4)

    # ---- Model artefact -------------------------------------------------
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(lr, os.path.join(model_dir, "model.joblib"))
def train_and_evaluate(config_path):
    """Train the XGB model on the date-split data and track the run in MLflow.

    Rows dated on or before 2017-05-31 form the training set; later rows form
    the validation set. ``transactionRevenue`` is log1p-transformed to build
    the targets. The model returned by ``run_xgb`` is scored on the training
    data and the metrics and model are logged to MLflow. Progress messages
    are appended to ``Training_log.txt``.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)
    train_data_path = config["split_data"]["train_path"]

    logger = App_Logger()
    cutoff = datetime.date(2017, 5, 31)

    # The context manager guarantees the log file is closed even when
    # training or the MLflow calls raise.
    with open('Training_log.txt', 'a+') as file_object:
        df = pd.read_csv(train_data_path)  # processed dataset
        df["date"] = pd.to_datetime(df["date"]).dt.date

        X_train = df[df['date'] <= cutoff]  # training rows
        val_X = df[df['date'] > cutoff]  # validation rows
        logger.log(file_object, "Splitting dataset completed")

        X_train = X_train.drop(['date'], axis=1)
        val_X = val_X.drop(['date'], axis=1)

        y_train = np.log1p(X_train["transactionRevenue"].values)
        val_y = np.log1p(val_X["transactionRevenue"].values)
        logger.log(
            file_object,
            "Log transformation of transaction Revenue values completed")

        x1 = X_train.drop(['transactionRevenue'], axis=1)
        # NOTE(review): the validation features/targets are prepared but
        # never evaluated; the metrics below are computed on training data.
        val_x1 = val_X.drop(['transactionRevenue'], axis=1)
        y_train = pd.DataFrame(y_train)
        val_y = pd.DataFrame(val_y)

        # ---------------------------- MLflow ----------------------------
        mlflow_config = config["mlflow_config"]
        mlflow.set_tracking_uri(mlflow_config['remote_server_uri'])
        mlflow.set_experiment(mlflow_config["experiment_name"])

        with mlflow.start_run(run_name=mlflow_config["run_name"]):
            model_xgb = run_xgb(x1, y_train)
            y_train_predict = model_xgb.predict(x1)
            rmse, mae, r2 = eval_metrics(y_train, y_train_predict)

            # NOTE(review): these values are hard-coded here; presumably
            # they mirror what run_xgb uses -- confirm.
            mlflow.log_param("n_estimators", 1200)
            mlflow.log_param("learning_rate", 0.5)
            mlflow.log_param("max_depth", 8)

            mlflow.log_metric('rmse', rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            tracking_url_type_store = urlparse(
                mlflow.get_artifact_uri()).scheme

            if tracking_url_type_store != "file":
                # Only a non-file store gets the registered-model name.
                mlflow.sklearn.log_model(
                    model_xgb,
                    "model",
                    registered_model_name=mlflow_config[
                        "registered_model_name"])
            else:
                # Previously called load_model() with a model object, which
                # is not a logging call; log the model without registering.
                mlflow.sklearn.log_model(model_xgb, "model")

        logger.log(file_object, "Model file created successfully")
def load_and_save(config_path):
    """Write the source dataset to the raw CSV with space-free headers.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    df = get_data(config_path)

    # Only the header written to disk changes; the DataFrame's own column
    # labels are left untouched.
    header_names = [name.replace(' ', '_') for name in df.columns]

    destination = config['load_data']['raw_dataset_csv']
    df.to_csv(destination, sep=',', index=False, header=header_names)
def load_and_save(config_path):
    """Fetch the train and test frames and write each to its raw CSV path.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    train_df, test_df = get_data(config_path)

    load_cfg = config["load_data"]
    raw_train_data_path = load_cfg["raw_train_data_csv"]
    raw_test_data_path = load_cfg["raw_test_data_csv"]

    # Train is written first, then test; neither file carries the index.
    outputs = ((train_df, raw_train_data_path), (test_df, raw_test_data_path))
    for frame, path in outputs:
        frame.to_csv(path, index=False)
def train_evaluate(config_file):
    """Fit an ElasticNet, save it, and write the test-set scores as JSON.

    Args:
        config_file: Path handed to ``read_params``.
    """
    config = read_params(config_file)

    train_path = config['split_data']['train_path']
    test_path = config['split_data']['test_path']
    target = config['base']['target_col']

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Features are every column except the target.
    train_X, train_Y = train.loc[:, train.columns != target], train[target]
    test_X, test_Y = test.loc[:, test.columns != target], test[target]

    random_state = config['base']['random_state']
    alpha = config['estimators']['ElasticNet']['params']['alpha']
    l1_ratio = config['estimators']['ElasticNet']['params']['l1_ratio']

    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                       random_state=random_state)
    model.fit(train_X, train_Y)
    predictions = model.predict(test_X)

    model_dir = config['model_dir']
    # Create the target directory first so the dump does not fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, 'model.joblib'))

    (rmse, mae, r2) = evaluate_metrics(test_Y, predictions)
    print("RMSE: ", rmse)
    print("MAE: ", mae)
    print("R2: ", r2)

    scores_file = config['report']['scores']
    with open(scores_file, "w") as f:
        scores = {"RMSE": rmse, "MAE": mae, "R2_Score": r2}
        json.dump(scores, f, indent=4)
def train_and_evaluate(config_path):
    """Fit an ElasticNet on the train split and compute test-set metrics.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]  # read but not used in this function

    hyper = config["estimators"]["ElasticNet"]["params"]
    alpha = hyper["alpha"]
    l1_ratio = hyper["l1_ratio"]
    target = config["base"]["target_col"]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y, test_y = train[target], test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)

    # NOTE(review): the metrics are computed but neither returned, printed
    # nor stored -- confirm whether the rest of this function is missing.
    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)
def load_and_save(config_path):
    """Save the fetched dataset to the raw CSV, underscoring header spaces.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    df = get_data(config_path)

    # Header labels for the output file: spaces become underscores.
    csv_header = [label.replace(" ", "_") for label in df.columns]

    load_cfg = config["load_data"]
    df.to_csv(
        load_cfg["raw_dataset_csv"],
        sep=",",
        index=False,
        header=csv_header,
    )
def load_and_save(config_path):
    """Dump the source data to the configured raw CSV file.

    Column names are written with spaces replaced by underscores; the
    index is not written.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    dataset = get_data(config_path)

    renamed = [column.replace(" ", "_") for column in dataset.columns]

    out_path = config["load_data"]["raw_dataset_csv"]
    dataset.to_csv(out_path, sep=",", index=False, header=renamed)
def load_and_save(config_path):
    """Load the dataset and store it as the raw CSV, reporting progress.

    Prints a banner before loading and another after the file is written.
    Header names are saved with spaces replaced by underscores.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    print("=== Loading Files ===")

    config = read_params(config_path)
    data = get_data(config_path)

    underscored = [heading.replace(" ", "_") for heading in data.columns]
    target_csv = config["load_data"]["raw_dataset_csv"]
    data.to_csv(target_csv, sep=",", index=False, header=underscored)

    print("=== Data Loaded and Saved in data/raw ===")
def train_and_evaluate(config_path):
    """Train an ElasticNet and record the run (params, metrics, model) in MLflow.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)
    test_data_path = config["split_data"]["test_path"]
    train_data_path = config["split_data"]["train_path"]
    random_state = config["base"]["random_state"]

    alpha = config["estimators"]["ElasticNet"]["params"]["alpha"]
    l1_ratio = config["estimators"]["ElasticNet"]["params"]["l1_ratio"]

    # Wrapped in a list, so the y selections are single-column DataFrames.
    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    # ------------------------------ MLflow ------------------------------
    mlflow_config = config["mlflow_config"]
    mlflow.set_tracking_uri(mlflow_config["remote_server_uri"])
    mlflow.set_experiment(mlflow_config["experiment_name"])

    with mlflow.start_run(run_name=mlflow_config["run_name"]):
        lr = ElasticNet(
            alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)

        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme

        if tracking_url_type_store != "file":
            # Only a non-file store gets the registered-model name.
            mlflow.sklearn.log_model(
                lr,
                "model",
                registered_model_name=mlflow_config["registered_model_name"])
        else:
            # Previously called load_model() with a model object, which is
            # not a logging call; log the model without registering it.
            mlflow.sklearn.log_model(lr, "model")
def split_and_saved_data(config_path):
    """Split the raw dataset into train and test CSV files.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    split_ratio = split_cfg["test_size"]
    random_state = config["base"]["random_state"]

    raw_df = pd.read_csv(raw_data_path, sep=",")
    train_part, test_part = train_test_split(
        raw_df, test_size=split_ratio, random_state=random_state)

    # Train is written first, then test, both without the index column.
    for part, path in ((train_part, train_data_path),
                       (test_part, test_data_path)):
        part.to_csv(path, sep=",", index=False, encoding='utf-8')
def train_and_evaluate(config_path):
    """Fit an SVC, report its test-set metrics, and persist model and reports.

    Prints Recall/Precision/F1/AUC, writes them and the hyper-parameters to
    the JSON report files, and dumps the classifier to
    ``<model_dir>/model.joblib``.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    prep_cfg = config["preprocess"]
    test_data_path = prep_cfg["test_path"]
    train_data_path = prep_cfg["train_path"]
    model_dir = config["model_dir"]

    svc_params = config["estimators"]["SVC"]["params"]
    c = svc_params["C"]
    gamma = svc_params["gamma"]

    # List-wrapped target: y selections are single-column DataFrames.
    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y, test_y = train[target], test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    svc = SVC(C=c, gamma=gamma)
    svc.fit(train_x, train_y)
    predicted_attrition = svc.predict(test_x)

    Recall, Precision, F1_Score, AUC = eval_metrics(
        test_y, predicted_attrition)

    print("SVC model (C=%f, gamma=%f):" % (c, gamma))
    print("  Recall: %s" % Recall)
    print("  Precision: %s" % Precision)
    print("  F1_Score: %s" % F1_Score)
    print("  AUC: %s" % AUC)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        json.dump(
            {
                "Recall": Recall,
                "Precision": Precision,
                "F1_Score": F1_Score,
                "AUC": AUC,
            },
            f,
            indent=4,
        )

    with open(params_file, "w") as f:
        json.dump({"C": c, "Gamma": gamma}, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(svc, os.path.join(model_dir, "model.joblib"))
def split_XY(config_path):
    """Split the filtered dataset into train and test CSV files.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    filter_data_path = config["filter_data"]["filter_data_csv"]
    filtered = pd.read_csv(filter_data_path, sep=',')

    split_cfg = config["split_data"]
    train_data_path = split_cfg["train_path"]
    test_data_path = split_cfg["test_path"]
    random_state = config["base"]["random_state"]
    split_ratio = split_cfg["test_size"]

    train_rows, test_rows = train_test_split(
        filtered, test_size=split_ratio, random_state=random_state)

    csv_kwargs = {"sep": ',', "index": False, "encoding": 'utf-8'}
    train_rows.to_csv(train_data_path, **csv_kwargs)
    test_rows.to_csv(test_data_path, **csv_kwargs)
def split_and_saved_data(config_path):
    """Read the raw CSV, split it, and write the train and test CSVs.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config['split_data']
    test_data_path = split_cfg['test_path']
    train_data_path = split_cfg['train_path']
    raw_dataset_csv = config['load_data']['raw_dataset_csv']
    split_ratio = split_cfg['test_size']
    random_state = config['base']['random_state']

    full_frame = pd.read_csv(raw_dataset_csv)
    train_frame, test_frame = train_test_split(
        full_frame,
        test_size=split_ratio,
        random_state=random_state,
    )

    # Neither output file carries the DataFrame index.
    train_frame.to_csv(train_data_path, index=False, sep=",")
    test_frame.to_csv(test_data_path, index=False, sep=",")
def train_and_evaluate(config_path):
    """Fit an ElasticNet, evaluate it, and write score/param JSON reports.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    hyper = config["estimators"]["ElasticNet"]["params"]
    alpha = hyper["alpha"]
    l1_ratio = hyper["l1_ratio"]
    target = config["base"]["target_col"]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y = train[target]
    test_y = test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_values = lr.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predicted_values)

    print("Alpha Ratio: %s):" % alpha)
    print("L1 Ratio: %s):" % l1_ratio)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        json.dump({"rmse": rmse, "mae": mae, "r2": r2}, f, indent=4)

    with open(params_file, "w") as f:
        json.dump({"alpha": alpha, "l1_ratio": l1_ratio}, f, indent=4)

    # NOTE(review): the model directory is created but no model is written
    # in the visible code -- confirm whether a dump step is missing.
    os.makedirs(model_dir, exist_ok=True)
def split_and_saved_data(config_path):
    """Create the train/test CSVs from the raw dataset.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_section = config['split_data']
    test_data_path = split_section['test_path']
    train_data_path = split_section['train_path']
    raw_data_path = config['load_data']['raw_dataset_csv']
    split_ratio = split_section['test_size']
    random_state = config['base']['random_state']

    source = pd.read_csv(raw_data_path, sep=',')
    subsets = train_test_split(
        source, test_size=split_ratio, random_state=random_state)

    # train_test_split yields (train, test); pair each with its output path.
    for subset, path in zip(subsets, (train_data_path, test_data_path)):
        subset.to_csv(path, sep=',', index=False)
def split_and_saved_data(config_path):
    """Split the raw dataset and store the two parts as CSV files.

    All paths, the test fraction and the random seed come from the
    configuration returned by ``read_params``.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    # Locations for the outputs and the input, plus the split settings.
    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]
    split_ratio = split_cfg["test_size"]
    random_state = config["base"]["random_state"]

    raw = pd.read_csv(raw_data_path, sep=",")

    train_set, test_set = train_test_split(
        raw, test_size=split_ratio, random_state=random_state)

    train_set.to_csv(train_data_path, sep=",", index=False, encoding='utf-8')
    test_set.to_csv(test_data_path, sep=",", index=False, encoding='utf-8')
def load_n_save(config_path):
    """Fetch the dataset via ``getData`` and save it as the raw CSV.

    Column names are written with spaces replaced by underscores.

    Args:
        config_path: Path handed to ``read_params`` and ``getData``.
    """
    config = read_params(config_path)
    data = getData(config_path)

    # Light preprocessing: only the header written to disk is changed.
    cleaned_names = [name.replace(" ", "_") for name in data.columns]

    destination = config["load_data"]["raw_dataset"]
    data.to_csv(destination, sep=",", index=False, header=cleaned_names)
def training_evaluation(config_path):
    """Train an ElasticNet, print its test metrics, and save model and reports.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    train_path = split_cfg["train_path"]
    test_path = split_cfg["test_path"]
    random_state = config["base"]["random_state"]

    hyper = config["estimators"]["ElasticNet"]["params"]
    alpha = hyper["alpha"]
    l1_ratio = hyper["l1_ratio"]

    model_dir = config["model_dir"]
    target_col = config["base"]["target_col"]

    train_data = pd.read_csv(train_path, sep=",")
    test_data = pd.read_csv(test_path, sep=",")

    X_train, y_train = train_data.drop(target_col, axis=1), train_data[target_col]
    X_test, y_test = test_data.drop(target_col, axis=1), test_data[target_col]

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(X_train, y_train)

    predicted_val = lr.predict(X_test)
    rmse, mae, r2 = evaluate_metrics(y_test, predicted_val)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    scores_file = config["reports"]["scores"]
    params_file = config["reports"]["params"]

    with open(scores_file, "w") as f:
        json.dump({"rmse": rmse, "mae": mae, "r2": r2}, f, indent=4)

    with open(params_file, "w") as f:
        json.dump({"alpha": alpha, "l1_ratio": l1_ratio}, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(lr, os.path.join(model_dir, "model.joblib"))
def split_and_save_data(config_path):
    """Split the raw dataset into train and test CSV files.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)
    test_data_path = config["split_data"]['test_path']
    train_data_path = config["split_data"]['train_path']
    raw_data_path = config["load_data"]['raw_dataset_csv']
    split_ratio = config["split_data"]['test_size']
    random_state = config["base"]['random_state']

    df = pd.read_csv(raw_data_path, sep=",")
    train, test = train_test_split(
        df, test_size=split_ratio, random_state=random_state)

    # index=False keeps the shuffled row index out of the files; otherwise
    # it is written as an extra unnamed column that reappears as a feature
    # when the CSVs are read back.
    train.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    test.to_csv(test_data_path, sep=",", index=False, encoding="utf-8")
def train_n_evaluate(config_path):
    """Fit an ElasticNet, write score/param reports, print them, save the model.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    train_path = split_cfg["train_path"]
    test_path = split_cfg["test_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    hyper = config["estimators"]["Elastic_Net"]["params"]
    alpha = hyper["alpha"]
    l1_ratio = hyper["l1_ratio"]
    target = config["base"]["target_col"]

    train = pd.read_csv(train_path, sep=",")
    test = pd.read_csv(test_path, sep=",")

    y_train, y_test = train[target], test[target]
    train_data = train.drop(target, axis=1)
    test_data = test.drop(target, axis=1)

    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                       random_state=random_state)
    model.fit(train_data, y_train)
    y_pred = model.predict(test_data)

    # NOTE(review): predictions are passed first and the truth second, and
    # the result is unpacked as (mae, mse, r2) -- confirm both match
    # get_metrics' signature and return order.
    mae, mse, r2 = get_metrics(y_pred, y_test)

    params_file = config["reports"]["params"]
    scores_file = config["reports"]["scores"]

    with open(scores_file, "w") as f:
        json.dump({'mse': mse, 'mae': mae, 'r2': r2}, f, indent=4)

    with open(params_file, "w") as f:
        json.dump({'alpha': alpha, 'l1_ratio': l1_ratio}, f, indent=4)

    print(f"ElasticNet model(aplha={alpha}, l1_ratio={l1_ratio})")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}")

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
def load_and_save(config_path):
    """Persist the source dataset as the raw CSV with underscored headers.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    source_df = get_data(config_path)

    # Spaces inside column names are replaced with underscores in the
    # written header, e.g. "first name" -> "first_name".
    tidy_header = [title.replace(" ", "_") for title in source_df.columns]

    # Output location comes from the load_data section of the config.
    raw_data_path = config["load_data"]["raw_dataset_csv"]

    source_df.to_csv(raw_data_path, sep=",", index=False, header=tidy_header)
def split_and_saved_data(config_path):
    """Split the data positionally 80/20 and write train and test CSVs.

    The first 80% of the rows returned by ``convert_to_date`` become the
    training set and the remainder the test set; no shuffling is done.

    Args:
        config_path: Path handed to ``read_params`` and ``convert_to_date``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    raw_data_path = config["load_data"]["raw_dataset_csv"]  # read, not used

    data = convert_to_date(config_path)

    # NOTE: the 0.8 train fraction is hard-coded; it is not read from config.
    boundary = int(0.8 * (len(data)))
    head_part = data[:boundary]
    tail_part = data[boundary:]

    # The index is written to both files (no index=False here).
    head_part.to_csv(train_data_path, encoding='utf-8')
    tail_part.to_csv(test_data_path, encoding='utf-8')
def train_and_evaluate(config_path):
    """Fit an ElasticNet, write score/param JSON reports, and save the model.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config['split_data']
    test_data_path = split_cfg['test_path']
    train_data_path = split_cfg['train_path']
    random_state = config['base']['random_state']
    model_dir = config['model_dir']

    hyper = config['estimators']['ElasticNet']['params']
    alpha = hyper['alpha']
    l1_ratio = hyper['l1_ratio']

    # List-wrapped target: y selections are single-column DataFrames.
    target = [config['base']['target_col']]

    train = pd.read_csv(train_data_path, sep=',')
    test = pd.read_csv(test_data_path, sep=',')

    train_y, test_y = train[target], test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)

    rmse, mae, r2 = eval_matrix(test_y, predicted_qualities)

    reports_cfg = config["reports"]
    scores_file = reports_cfg["scores"]
    params_file = reports_cfg["params"]

    with open(scores_file, "w") as f:
        json.dump({"rmse": rmse, "mae": mae, "r2": r2}, f, indent=4)

    with open(params_file, "w") as f:
        json.dump({"alpha": alpha, "l1_ratio": l1_ratio}, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(lr, os.path.join(model_dir, "model.joblib"))
def log_production_model(config_path):
    """Promote the lowest-MAE model version to Production and export it.

    Looks up the run with the smallest ``metrics.mae``, moves the registered
    model version created by that run to the "Production" stage and every
    other version to "Staging", then loads the production model and dumps it
    to ``config["webapp_model_dir"]``.

    Args:
        config_path: Path handed to ``read_params``.

    Raises:
        ValueError: If no runs are found, or if no registered version of the
            model belongs to the best run.
    """
    config = read_params(config_path)
    mlflow_config = config["mlflow_config"]
    model_name = mlflow_config["registered_model_name"]
    remote_server_uri = mlflow_config["remote_server_uri"]

    mlflow.set_tracking_uri(remote_server_uri)

    # NOTE(review): MLflow documents experiment_ids as a list of IDs; a bare
    # int is passed here -- confirm the installed version accepts it.
    runs = mlflow.search_runs(experiment_ids=1)
    if runs.empty:
        raise ValueError("No MLflow runs found; cannot pick a production model")

    # idxmin() gives the row label of the smallest MAE. The earlier
    # `sort_values(...)[0]` looked up *label* 0, i.e. the first row of the
    # unsorted frame, not the minimum.
    best_row = runs["metrics.mae"].idxmin()
    lowest_run_id = runs.loc[best_row, "run_id"]

    client = MlflowClient()
    logged_model = None
    for mv in client.search_model_versions(f"name='{model_name}'"):
        mv = dict(mv)
        is_best = mv["run_id"] == lowest_run_id
        if is_best:
            logged_model = mv["source"]
            pprint(mv, indent=4)
        client.transition_model_version_stage(
            name=model_name,
            version=mv["version"],
            stage="Production" if is_best else "Staging",
        )

    if logged_model is None:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            f"No registered version of '{model_name}' belongs to run "
            f"{lowest_run_id}"
        )

    loaded_model = mlflow.pyfunc.load_model(logged_model)
    model_path = config["webapp_model_dir"]  # "prediction_service/model"
    joblib.dump(loaded_model, model_path)
def preprocessing(config_path):
    """Select the modelling columns, derive ``No_Years``, encode, and save.

    ``Year`` is replaced by ``No_Years`` (configured current year minus the
    year), the frame is dummy-encoded with the first level dropped, and the
    result is written to ``config["preprocess"]["processed_dataset_csv"]``.

    Args:
        config_path: Path handed to ``read_params`` and ``get_data``.
    """
    config = read_params(config_path)
    preprocessed_data_path = config["preprocess"]["processed_dataset_csv"]
    curr_year = config["preprocess"]["current_year"]

    # Source data as a DataFrame.
    df = get_data(config_path)

    selected_columns = [
        'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
        'Seller_Type', 'Transmission', 'Owner'
    ]
    # Explicit copy: the frame is modified below, and writing into a bare
    # column selection of `df` triggers pandas' chained-assignment warning.
    update_df = df[selected_columns].copy()

    # Express the year as a number of years counted back from the
    # configured current year.
    update_df['No_Years'] = int(curr_year) - update_df['Year']
    update_df = update_df.drop(columns=['Year'])

    update_df = pd.get_dummies(update_df, drop_first=True)
    update_df.to_csv(preprocessed_data_path,
                     sep=",",
                     index=False,
                     encoding="utf-8")
def split_n_save(config_path):
    """Split the raw dataset into train and test sets and save both as CSV.

    The output locations are the configured ``train_path`` and ``test_path``
    (the data/processed folder); both paths are printed before the split.

    Args:
        config_path: Path handed to ``read_params``.
    """
    # The docstring deliberately uses a forward slash: the earlier
    # "data\processed" put the invalid escape "\p" into a normal string.
    config = read_params(config_path)

    # Fetching the configuration values.
    train_path = config["split_data"]["train_path"]
    test_path = config["split_data"]["test_path"]
    raw_path = config["load_data"]["raw_dataset"]
    test_split = config["split_data"]["test_size"]
    random_state = config["base"]["random_state"]

    # Processing and saving.
    print(train_path)
    print(test_path)

    df = pd.read_csv(raw_path, sep=",")
    train, test = train_test_split(
        df, test_size=test_split, random_state=random_state)

    train.to_csv(train_path, sep=",", index=False)
    test.to_csv(test_path, sep=",", index=False)
def train_and_evaluate(config_path):
    """Fit a RandomForestClassifier, report accuracy, and save the model.

    Writes the accuracy and the values returned by ``feature_select`` to the
    JSON report files, then dumps the classifier to
    ``<model_dir>/model.joblib``.

    Args:
        config_path: Path handed to ``read_params`` and ``feature_select``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    model_dir = config["model_dir"]
    random_state = config["base"]["random_state"]
    target = config["base"]["target"]

    train = pd.read_csv(train_data_path, sep=',')
    test = pd.read_csv(test_data_path, sep=',')

    train_x, test_x = train.drop(target, axis=1), test.drop(target, axis=1)
    train_y, test_y = train[target], test[target]

    RF = RandomForestClassifier(random_state=random_state)
    RF.fit(train_x, train_y)
    RFPrediction = RF.predict(test_x)

    accuracy = evaluate_accuracy(test_y, RFPrediction)
    print("Model accuracy: %s" % accuracy)

    # NOTE(review): the report key is spelled "acurracy"; kept as-is because
    # report consumers may depend on it -- confirm.
    scores_file = config["report"]["scores"]
    with open(scores_file, 'w') as f:
        json.dump({"acurracy": accuracy}, f, indent=4)

    k, corr_value = feature_select(config_path)

    params_file = config["report"]["params"]
    with open(params_file, 'w') as f:
        json.dump({"k": k, "correlation": corr_value}, f, indent=4)

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(RF, os.path.join(model_dir, "model.joblib"))
def train_and_evaluate(config_path):
    """Fit an ElasticNet, print its test-set metrics, and save the model.

    Args:
        config_path: Path handed to ``read_params``.
    """
    config = read_params(config_path)

    split_cfg = config["split_data"]
    test_data_path = split_cfg["test_path"]
    train_data_path = split_cfg["train_path"]
    random_state = config["base"]["random_state"]
    model_dir = config["model_dir"]

    hyper = config["estimators"]["ElasticNet"]["params"]
    alpha = hyper["alpha"]
    l1_ratio = hyper["l1_ratio"]

    # List-wrapped target: y selections are single-column DataFrames.
    target = [config["base"]["target_col"]]

    train = pd.read_csv(train_data_path, sep=",")
    test = pd.read_csv(test_data_path, sep=",")

    train_y, test_y = train[target], test[target]
    train_x = train.drop(target, axis=1)
    test_x = test.drop(target, axis=1)

    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    lr.fit(train_x, train_y)

    predicted_qualities = lr.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predicted_qualities)

    print(f"Elasticnet model (alpha: {alpha} l1_ratio: {l1_ratio})")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(lr, os.path.join(model_dir, "model.joblib"))