class ModelGenerator(object):
    """Builds a fixed family of regression/classification models and fits them.

    Configuration is read from the JSON file at ``config_path``; the model
    name logged at build time comes from ``config['model']['MODEL']``.
    """

    def __init__(self, config_path):
        self.config = open_json(config_path)
        self.logger = Logger()
        self.models = self.build_model()

    def build_model(self):
        """Instantiate every candidate model, keyed by name.

        Returns:
            dict: "Lasso", "ENet", "GBoost", "XGBoost" and "LogReg" mapped to
            *unfitted* estimators.  Hyperparameters are hard-coded —
            presumably the result of a prior tuning run (TODO confirm).
        """
        self.logger.log(
            f" - Building a model [ {self.config['model']['MODEL']} ]")

        model_Lasso = make_pipeline(RobustScaler(),
                                    Lasso(alpha=0.000327, random_state=18))
        model_ENet = make_pipeline(
            RobustScaler(),
            ElasticNet(alpha=0.00052, l1_ratio=0.70654, random_state=18))
        model_GBoost = GradientBoostingRegressor(
            n_estimators=3000,
            learning_rate=0.05,
            max_depth=4,
            max_features="sqrt",
            min_samples_leaf=15,
            min_samples_split=10,
            loss="huber",
            random_state=18,
        )
        model_XGB = XGBRegressor(
            colsample_bylevel=0.9229733609038979,
            colsample_bynode=0.21481791874780318,
            colsample_bytree=0.607964318297635,
            gamma=0.8989889254961725,
            learning_rate=0.009192310189734834,
            max_depth=3,
            n_estimators=3602,
            reg_alpha=3.185674564163364e-12,
            reg_lambda=4.95553539265423e-13,
            seed=18,
            subsample=0.8381904293270576,
            verbosity=0,
        )
        model_logistic = LogisticRegression()

        models = {
            "Lasso": model_Lasso,
            "ENet": model_ENet,
            "GBoost": model_GBoost,
            "XGBoost": model_XGB,
            "LogReg": model_logistic,
        }
        return models

    def fit_model(self, dataset, metaset, valid_size=45569):
        """Split off a validation set, fit the LogReg model and print metrics.

        The first ``valid_size`` rows of ``dataset["train"]`` become the
        validation set; the remainder is kept for training.  ``dataset`` is
        mutated in place ("valid" is added, "train" is shrunk).

        Args:
            dataset: dict holding a "train" DataFrame.
            metaset: dict whose "__target__" entry names the label column.
            valid_size: number of leading rows held out for validation
                (default keeps the original hard-coded behavior).

        Returns:
            dict: model name -> prediction array (currently only "LogReg"
            is fitted).
        """
        dataset["valid"] = dataset["train"][:valid_size]
        dataset["train"] = dataset["train"][valid_size:]

        train_label = dataset["train"][metaset["__target__"]]
        train_value = dataset["train"].drop(columns=metaset["__target__"])
        valid_label = dataset["valid"][metaset["__target__"]]
        valid_value = dataset["valid"].drop(columns=metaset["__target__"])

        predicts = dict()
        models = self.models

        def fitting(model, x_train, x_test, y_train, y_test):
            # Fit, predict on the held-out split, print metrics, and hand
            # the predictions back to the caller.
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)
            self.metrics(y_test, y_pred)
            return y_pred

        print("FIT - LogReg")
        predicts["LogReg"] = fitting(
            model=models["LogReg"],
            x_train=train_value,
            x_test=valid_value,
            y_train=train_label,
            y_test=valid_label,
        )
        return predicts

    def metrics(self, y_test, y_pred):
        """Print classification metrics for one set of predictions.

        NOTE(review): these are classification metrics, yet most models built
        above are regressors — only "LogReg" predictions are compatible;
        verify before wiring the other models in.
        """
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_score = roc_auc_score(y_test, y_pred, average="macro")
        print(
            f"accr : {accuracy:.2f}, prec : {precision:.2f}, recall : {recall:.2f}"
        )
        print(f"f1 : {f1:.2f}, auc : {roc_score:.2f}")
class DataAnalyzer(object):
    """Interactive EDA driver: inspects dtypes/features and lets the user
    fix column dtypes via console prompts."""

    def __init__(self, config_path, dataset, metaset):
        self.config = open_json(config_path)
        self.logger = Logger()
        self.eda = EDA(self.config["analyzer"])
        self.dataset = dataset
        self.metaset = metaset

    def analize(self):
        """Entry point: print dataset summary, then run dtype and feature
        analysis (steps 2.1 / 2.2)."""
        dataset = self.dataset
        metaset = self.metaset

        pd.set_option("display.max_columns", metaset["__ncolumns__"])
        pd.set_option("display.width", 1000)

        self.logger.log(
            f"DATASET Analysis \n"
            f" Total Train dataset : {metaset['__nrows__']['train']} \n"
            f" Total Test dataset : {metaset['__nrows__']['test']} \n"
            f" Total Columns num : {metaset['__ncolumns__']} \n"
            f" Target label : {metaset['__target__']} \n"
            f" Target dtype : {dataset['train'][metaset['__target__']].dtype} \n"
        )
        self.eda.countplot(
            dataframe=dataset["train"],
            column=metaset["__target__"],
            title="Target Label Distributions",
        )
        request_user_input()
        self.analize_dtype()  # 2.1
        self.analize_feature()  # 2.2

    def analize_dtype(self):
        """2.1 — show the dtype table and let the user convert columns by
        index until satisfied."""
        self.logger.log(" - 2.1 Analize Dtype", level=2)

        # SHOW INFO (module-level get_meta_info, not the method below)
        print(get_meta_info(self.metaset, self.dataset))

        # USER COMMAND loop: pick an index, convert, re-display.
        answer = ask_boolean("Are there any issues that need to be corrected?")
        while answer:
            target_index = request_user_input(
                f"Please enter the index of the target to be modified.",
                valid_inputs=range(self.metaset["__ncolumns__"]),
                skipable=True,
                default=None,
            )
            if target_index is None:
                break
            target_col = self.metaset["__columns__"][int(target_index)]
            self.convert_dtype(target_col)
            print(get_meta_info(self.metaset, self.dataset))
            answer = ask_boolean(
                "Are there any issues that need to be corrected?")

    def analize_dataset(self):
        """Print the dataset summary plus per-column descriptions, then
        optionally jump to feature analysis."""
        metaset = self.metaset
        dataset = self.dataset

        self.logger.log(
            f"DATASET Analysis \n"
            f" Total Train dataset : {metaset['__nrows__']['train']} \n"
            f" Total Test dataset : {metaset['__nrows__']['test']} \n"
            f" Total Columns num : {metaset['__ncolumns__']} \n"
            f" Target label : {metaset['__target__']} \n"
            f" [train distribute(percent.)]\n{metaset['__distribution__']['train']} \n"
            f" [test distribute(percent.)]\n{metaset['__distribution__']['test']} \n"
        )
        request_user_input()

        for i, col in enumerate(metaset["__columns__"]):
            col_meta = metaset[col]
            self.logger.log(f"{col_meta['index']:3d} "
                            f"{col_meta['name']:20} "
                            f"{col_meta['dtype']:10} "
                            f"{col_meta['descript']}")

        answer = ask_boolean("Are there any issues that need to be corrected?")
        self.config["options"]["FIX_COLUMN_INFO"] = answer
        if self.config["options"]["FIX_COLUMN_INFO"] is True:
            self.analize_feature()

    def analize_feature(self):
        """2.2 — show each column's info, then offer interactive fixes."""
        self.logger.log("- 2.2 : Check Data Features", level=2)
        for i, col in enumerate(self.metaset["__columns__"]):
            col_meta = self.metaset[col]
            col_data = self.dataset["train"][col]
            show_col_info(col_meta, col_data)

        # NOTE(review): this loop sits AFTER the for-loop, so it only ever
        # operates on the LAST column (`col`/`col_meta`/`col_data` are loop
        # leftovers).  Behavior kept as-is — confirm whether it was meant to
        # be inside the loop.
        answer = ask_boolean(
            "Are there any issues that need to be corrected?", default="N")
        while answer:
            target = request_user_input(
                f"Please enter issue [none, dtype]",
                valid_inputs=["dtype"],
                skipable=True,
                default=None,
            )
            # BUG FIX: original compared against "Dtype" while the accepted
            # input is "dtype", so the branch could never fire.  Match
            # case-insensitively (and guard the skip/None case).
            if target is not None and target.lower() == "dtype":
                self.convert_dtype(col)
            show_col_info(col_meta, col_data)
            answer = ask_boolean(
                "Are there any issues that need to be corrected?", default="N")

        print(get_meta_info(self.metaset, self.dataset))
        return self.dataset

    def convert_dtype(self, col):
        """Ask for the correct dtype of ``col`` and run the conversion.

        Only datetime/category/bool conversions are implemented; the
        num-int / num-float choices are accepted but currently no-ops.
        """
        right_dtype = request_user_input(
            f"Please enter right dtype [num-int, num-float, bool, category, datetime]",
            valid_inputs=[
                "num-int", "num-float", "bool", "category", "datetime"
            ],
            skipable=True,
            default=None,
        )
        print(f"you select dtype {right_dtype}")
        if right_dtype is None:
            return
        # BUG FIX: the valid inputs are lower-case but the original compared
        # against capitalised literals ("Datetime", "Category", "Bool"), so
        # no conversion branch could ever match.  Compare case-insensitively.
        right_dtype = right_dtype.lower()
        if right_dtype == "datetime":
            self.convert_datetime(col)
        elif right_dtype == "category":
            self.convert_category(col)
        elif right_dtype == "bool":
            self.convert_boolean(col)

    def convert_datetime(self, col):
        """Convert train column ``col`` to datetime; optionally split it into
        year/month/day/hour/day-of-week derived columns."""
        self.dataset["train"][col] = pd.to_datetime(self.dataset["train"][col])
        self.metaset[col]["log"].append(
            f"dtype changed : {self.metaset[col]['dtype']} to Datetime")
        self.metaset[col]["dtype"] = "Datetime"

        answer = ask_boolean("Do you want to split datetime?")
        if answer:
            metaset, trainset = self.metaset, self.dataset["train"]
            # add_col_info returns (metaset, series); the series is assigned
            # into a new derived column on the train frame.
            metaset, trainset[f"{col}_year"] = add_col_info(
                metaset, trainset[col].dt.year, f"{col}_year")
            metaset, trainset[f"{col}_month"] = add_col_info(
                metaset, trainset[col].dt.month, f"{col}_month")
            metaset, trainset[f"{col}_day"] = add_col_info(
                metaset, trainset[col].dt.day, f"{col}_day")
            metaset, trainset[f"{col}_hour"] = add_col_info(
                metaset, trainset[col].dt.hour, f"{col}_hour")
            metaset, trainset[f"{col}_dow"] = add_col_info(
                metaset, trainset[col].dt.day_name(), f"{col}_dow")
            self.metaset = metaset
            self.dataset["train"] = trainset

    def convert_category(self, col):
        """Cast train column ``col`` to string and record category metadata."""
        col_meta = self.metaset[col]
        col_data = self.dataset["train"][col]

        col_data = col_data.apply(str)
        col_meta["log"].append(
            f"dtype changed : {col_meta['dtype']} to Category")
        col_meta["dtype"] = "Category"
        col_meta["unique"] = col_data.unique()
        col_meta["rate"] = (col_data.value_counts(), )

        self.metaset[col] = col_meta
        self.dataset["train"][col] = col_data

    def convert_boolean(self, col):
        """Cast train column ``col`` to string and record boolean metadata."""
        col_meta = self.metaset[col]
        col_data = self.dataset["train"][col]

        col_data = col_data.apply(str)
        col_meta["log"].append(
            f"dtype changed : {col_meta['dtype']} to Boolean")
        col_meta["dtype"] = "Boolean"
        col_meta["rate"] = col_data.value_counts()

        self.metaset[col] = col_meta
        self.dataset["train"][col] = col_data

    def get_meta_info(self, columns):
        """Build a DataFrame summarising name/dtype/description plus five
        sample values (train rows 1..5) for each column in ``columns``.

        NOTE(review): shadows the module-level ``get_meta_info`` used in
        analize_dtype — the two have different signatures; confirm intent.
        """
        info = list()
        for col in columns:
            col_meta = self.metaset[col]
            col_info = {
                "name": col,
                "dtype": col_meta["dtype"],
                "desc": col_meta["descript"],
            }
            for i in range(1, 6):
                col_info[f"sample{i}"] = self.dataset["train"][col][i]
            info.append(col_info)

        info_df = pd.DataFrame(info)
        self.logger.log(f" - Dtype \n {info_df}\n\n", level=3)
        return info_df
def _main_(args):
    """Pipeline entry point: load data, analyze it, build models, fit.

    Args:
        args: parsed CLI arguments (currently unused by the pipeline).
    """
    init_logger()
    logger = Logger()
    logger.log("Step 0 >> Setting ")

    logger.log("Step 1 >> Data Preparation")
    logger.log("- 1 : Data Collection ", level=1)
    loader = DataLoader(config_path="./config.json")

    logger.log("- 2 : Data Analization ", level=1)
    analyzer = DataAnalyzer(
        config_path="./config.json",
        dataset=loader.dataset,
        metaset=loader.metaset,
    )
    analyzer.analize()

    logger.log("Step 3 >> Model Generation")
    model_generator = ModelGenerator(config_path="./config.json")

    logger.log("Step 4 >> Data Preprocess")
    # Constructed for its setup side effects; label_split() is not wired in yet.
    preprocessor = PreProcessor(
        config_path="./config.json",
        dataset=analyzer.dataset,
        metaset=analyzer.metaset,
    )

    logger.log("Step 5 >> Model Evaluation")
    # NOTE: the original bound this call's (None) return value to a `models`
    # local that was never used; the binding was dropped.
    model_generator.fit_model(
        dataset=analyzer.dataset,
        metaset=analyzer.metaset,
    )
class DataLoader(object):
    """ Data Loader

    Loads train/valid/test CSV files from ``dirpath/filepath`` (per the JSON
    config), splitting a test set off the train set when no test file exists,
    and builds per-column metadata.
    """

    def __init__(self, config_path):
        self.config = open_json(config_path)
        self.filepath = self.config["dataset"]["filepath"]
        self.basepath = self.config["dataset"]["dirpath"]
        self.format = self.config["dataset"]["format"].lower()
        self.logger = Logger()
        self.dataset = self.load_dataset()  # 1.1
        self.metaset = self.load_metaset()  # 1.2

    def load_dataset(self):
        """ 1.1 Data Loading

        Returns:
            dict: "train"/"valid"/"test" DataFrames (valid/test may be None
            when the files are absent; a missing test set is split off train).
        """
        self.logger.log(
            f"- 1.1 {self.config['dataset']['category']} type dataset loading .. ",
            level=2,
        )
        # BUG FIX: the original reassigned self.filepath to
        # os.path.join(self.basepath, self.filepath) here, so read_csv()
        # (which joins basepath again) resolved to basepath/basepath/filepath.
        # Use a local only for the log message instead.
        fullpath = os.path.join(self.basepath, self.filepath)
        self.logger.log(f"- '{fullpath}' is now loading...", level=2)

        dataset = {
            "train": self.read_csv("train"),
            "valid": self.read_csv("valid"),
            "test": self.read_csv("test"),
        }
        if dataset["test"] is None:
            dataset = self.split_dataset(dataset, "train", "test")
        return dataset

    def read_csv(self, name):
        """Read ``basepath/filepath/<name>.<format>``; None when missing."""
        filepath = os.path.join(self.basepath, self.filepath,
                                name + "." + self.format)
        # Config may spell "no index column" as the string "None".
        index_col = self.config["dataset"].get("index", None)
        index_col = index_col if index_col != "None" else None
        try:
            csv_file = open_csv(filepath=filepath, index_col=index_col)
            self.logger.log(f"- {name:5} data{csv_file.shape} is now loaded",
                            level=3)
        except FileNotFoundError:
            # Missing optional file (e.g. valid/test) is expected.
            csv_file = None
        return csv_file

    def split_dataset(self, dataset, origin, target):
        """Split dataset[origin] into origin/target by the configured ratio.

        ``split_ratio`` is the fraction kept in ``origin``; the split is
        deterministic (random_state=42).
        """
        split_ratio = self.config["dataset"]["split_ratio"]
        dataset[origin], dataset[target] = train_test_split(
            dataset[origin], train_size=split_ratio, random_state=42
        )
        self.logger.log(
            f"- {origin:5} data{dataset[origin].shape}"
            f", {target:5} data{dataset[target].shape}"
            f" (split ratio: {split_ratio})",
            level=3,
        )
        return dataset

    def load_metaset(self):
        """ 1.2 Prepare metadata

        Returns:
            dict: metaset seeded by init_set_info, enriched with the optional
            description file and per-column info from the train frame.
        """
        self.logger.log(f"- 1.2 Prepare metadata", level=2)

        metaset = init_set_info(self.config, self.dataset)
        metaset = self.read_description(metaset)

        for i, col in enumerate(metaset["__columns__"]):
            # convert_dtypes() gives nullable dtypes for cleaner inference.
            col_data = self.dataset["train"][col].convert_dtypes()
            metaset[col] = init_col_info(metaset, col_data, col)

        return metaset

    def read_description(self, metaset):
        """Merge ``col: description`` lines from the optional description file
        into metaset; returns metaset unchanged when absent/missing."""
        descfile = self.config["metaset"].get("descpath", None)
        if descfile is None:
            return metaset

        descpath = os.path.join(
            self.config["dataset"]["dirpath"],
            self.config["dataset"]["filepath"],
            descfile,
        )
        try:
            # newline="\r\n" preserved: the file presumably uses CRLF line
            # endings — confirm before changing.
            with open(descpath, "r", newline="\r\n") as desc_file:
                self.logger.log(f"- '{descpath}' is now loaded", level=3)
                desc_list = desc_file.read().splitlines()
                for desc_line in desc_list:
                    col, desc = desc_line.split(":")
                    metaset[col]["descript"] = desc.strip()
                return metaset
        except FileNotFoundError as e:
            self.logger.warn(f"Description File Not Found Error, '{descpath}'")
            return metaset