def run(self):
    """Create this feature for the full train/test tables and every CV fold.

    The work is timed under ``self.name`` and proceeds in two phases:

    1. Load the columns returned by ``self.depends_on()`` from the ``train``
       and ``test`` tables (columns returned by ``self.target_cols()`` are
       left out for ``test``) together with the ``memo`` table, pass them
       through ``self.create_features`` and hand the results to
       ``insert_cols`` (train/test) and ``table_write`` (memo).
    2. Repeat the load / ``create_features`` / ``insert_cols`` cycle for each
       pair of tables found by ``find_table_name`` for ``cv_train`` and
       ``cv_test`` (excluding names matching ``stats``). The ``memo`` value
       returned by each call is fed into the next one but is not written
       back during this phase.

    Raises:
        ValueError: if the number of ``cv_train`` tables differs from the
            number of ``cv_test`` tables.
    """

    def load_pair(train_table, test_table):
        # The column lists are re-evaluated on every call, exactly as the
        # inline loads they replace did.
        train_df = table_load(table_name=train_table, cols=self.depends_on())
        test_df = table_load(
            table_name=test_table,
            cols=[
                col for col in self.depends_on()
                if col not in self.target_cols()
            ],
        )
        return train_df, test_df

    with timer(self.name):
        # Phase 1: full train / test tables.
        train, test = load_pair("train", "test")
        memo = table_load(table_name="memo")
        train, test, memo = self.create_features(train, test, memo)
        insert_cols(table_name="train", df=train)
        insert_cols(table_name="test", df=test)
        table_write(table_name="memo", df=memo)

        # Phase 2: per-fold tables.
        cv_train_tables = find_table_name(
            like="cv_train", unlike="stats")["table_name"].tolist()
        cv_test_tables = find_table_name(
            like="cv_test", unlike="stats")["table_name"].tolist()
        if len(cv_train_tables) != len(cv_test_tables):
            raise ValueError("# of cv_train is not equal to # of cv_test!")

        # Tables are paired by position in the two result lists; the length
        # check above guarantees zip() drops nothing.
        for cv_train_table, cv_test_table in zip(cv_train_tables,
                                                 cv_test_tables):
            train, test = load_pair(cv_train_table, cv_test_table)
            train, test, memo = self.create_features(train, test, memo)
            insert_cols(table_name=cv_train_table, df=train)
            insert_cols(table_name=cv_test_table, df=test)
# Remove the tables listed in drop_table_names with a single multi-statement
# query; nothing is sent when the list is empty.
if len(drop_table_names) > 0:
    drop_statements = [
        "DROP TABLE {};".format(drop_table_name)
        for drop_table_name in drop_table_names
    ]
    exec_query("".join(drop_statements))

# Collect the names of every table to summarise, in the order
# train -> test -> cv_result.
table_names = []
for name_pattern in ("train", "test", "cv_result"):
    table_names += find_table_name(like=name_pattern)["table_name"].to_list()

# For each table, write a "<table>_stats" table holding one row per column:
# its dtype, its null count and the transposed describe() summary.
for table_name in tqdm(table_names):
    df = table_load(table_name=table_name)

    dtype_frame = df.dtypes.rename("dtype").astype(str).to_frame()
    null_frame = df.isnull().sum().rename("null_count").to_frame()
    # The percentile labels are renamed to per_25 / per_50 / per_75.
    describe_frame = df.describe().T.rename(columns={
        "25%": "per_25",
        "50%": "per_50",
        "75%": "per_75",
    })

    stats = pd.concat(
        [dtype_frame, null_frame, describe_frame],
        axis=1,
        sort=False,
    )
    table_write(table_name="{}_stats".format(table_name), df=stats)
import os

import pandas as pd

from db import exec_query, table_write

# Rebuilds the train / test / memo tables from the CSV files found under
# $PROJECT_DIR/input and creates an index on each table's "index" column.

print("Initializing Database...")

# Drop tables if they exist
for drop_query in (
    "DROP TABLE IF EXISTS train;",
    "DROP TABLE IF EXISTS test;",
    "DROP TABLE IF EXISTS memo;",
):
    exec_query(drop_query)

# Read data
project_dir = os.environ["PROJECT_DIR"]
train = pd.read_csv(project_dir + "/input/train.csv")
test = pd.read_csv(project_dir + "/input/test.csv")
memo = pd.read_csv(project_dir + "/input/memo.csv")

# Insert train, test, memo data into DB
for target_table, frame in (("train", train), ("test", test), ("memo", memo)):
    table_write(table_name=target_table, df=frame)

# Create Index
for index_query in (
    "CREATE INDEX train_index on train (index);",
    "CREATE INDEX test_index on test (index);",
    "CREATE INDEX memo_index on memo (index);",
):
    exec_query(index_query)

print("Done!!")
# NOTE(review): this line starts in the middle of a call -- the leading
# keyword arguments and the closing ")" belong to a call whose head is not
# visible here. The statements that follow presumably run once per fold
# (they use n_fold and append to accuracies); the loop header is not visible
# either -- confirm against the full file.
#
# What the visible statements do, in order:
#   * store y_pred in valid["survived"], then replace y_pred with the result
#     of postprocessing(train=train, test=valid);
#   * build cv_result with columns index / predicted / real / difference /
#     difference_abs (prediction minus y_real and its absolute value) and
#     write it to the table "cv_result_<n_fold>";
#   * threshold the flattened predictions at 0.5 to get 0/1 labels, compute
#     accuracy as the fraction equal to y_real, append it to accuracies and
#     print it;
#   * finally print the mean of accuracies as "Total Accuracy".
categorical_features=categorical_cols, target_cols=target_cols, train_cols=train_cols, params=params, ) valid["survived"] = y_pred y_pred = postprocessing(train=train, test=valid) cv_result = pd.DataFrame({ "index": valid.index, "predicted": y_pred.flatten(), "real": y_real, "difference": y_pred.flatten() - y_real, "difference_abs": abs(y_pred.flatten() - y_real), }) table_write(table_name="cv_result_{}".format(n_fold), df=cv_result) predicted = (y_pred.flatten() > 0.5).astype(int) accuracy = (predicted == y_real).sum() / len(predicted) accuracies.append(accuracy) print("Accuracy: {}".format(accuracy)) print("Total Accuracy: {}".format(np.mean(accuracies)))
from db import table_load, table_write
from utils import timer

# Splits the "train" table into stratified CV folds and writes each fold to
# the tables "cv_train_<n>" / "cv_test_<n>".
#
# Usage: the script takes one optional argument, the name (without ".json")
# of a file under ./configs; "lightgbm_0" is used when it is not given.
if __name__ == "__main__":
    if len(sys.argv) == 2:
        config_file_name = sys.argv[1]
    else:
        config_file_name = "lightgbm_0"
    print("Config file Name: ", config_file_name)

    with timer("kfold"):
        # Use a context manager so the config file handle is closed as soon
        # as the JSON has been parsed.
        with open("./configs/{}.json".format(config_file_name)) as config_file:
            config: dict = json.load(config_file)

        train = table_load("train")
        # Stratify on the target column named in the config.
        folds = StratifiedKFold(
            n_splits=config["cv"]["n_splits"],
            shuffle=True,
            random_state=config["cv"]["random_state"],
        ).split(train, train[config["features"]["target"]])

        for n_fold, (train_index, valid_index) in enumerate(folds):
            # split() yields positional row indices, so select rows by
            # position (.iloc); label-based .loc only gives the same rows
            # when the frame's index happens to be 0..n-1 in order.
            cv_train_df = train.iloc[train_index]
            cv_test_df = train.iloc[valid_index]
            table_write(table_name="cv_train_{}".format(n_fold),
                        df=cv_train_df)
            table_write(table_name="cv_test_{}".format(n_fold),
                        df=cv_test_df)