def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
    """Predict on ``test_csv`` and write the predictions to ``prediction_csv``.

    The test frame is scored in row chunks so that preprocessing never has
    to hold a transformed copy of the whole frame at once. Predictions are
    collected in chunk order, paired with the ``line_id`` column, sorted by
    ``line_id`` and written as CSV without the index.

    Args:
        test_csv: Path of the test CSV; handed to ``read_df``.
        prediction_csv: Path the prediction table is written to. A ``tmp``
            directory is created beside it and stored in
            ``self.config.tmp_dir``.

    Returns:
        A ``(result, score)`` pair. ``result`` is the DataFrame with the
        columns ``line_id`` and ``prediction``. ``score`` is whatever
        ``validate`` returns when a matching target file exists next to the
        test file, otherwise ``None``.
    """
    chunk_size = 100000

    self.config["task"] = "predict"
    # Joining instead of appending "/tmp" keeps the directory relative when
    # prediction_csv has no directory part; plain concatenation would then
    # resolve to the filesystem root "/tmp".
    self.config.tmp_dir = os.path.join(os.path.dirname(prediction_csv), "tmp")
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    df = read_df(test_csv, self.config)
    # Captured before any chunk is processed so the ids reflect the frame
    # exactly as it was read.
    line_ids = list(df["line_id"])

    predictions = []
    for start in range(0, len(df), chunk_size):
        # Work on a copy: preprocess is called for its in-place effect and
        # must not touch the source frame.
        X = df[start:start + chunk_size].copy()
        preprocess(X, self.config)
        predictions.extend(predict(X, self.config))

    result = pd.DataFrame({"line_id": line_ids, "prediction": predictions})
    result.sort_values("line_id", inplace=True)
    result.to_csv(prediction_csv, index=False)

    # Only the file name is rewritten; replacing across the whole path would
    # also mangle any directory whose name contains "test".
    test_dir, test_name = os.path.split(test_csv)
    target_csv = os.path.join(test_dir, test_name.replace("test", "test-target"))

    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
def train(self, train_csv: str, mode: str):
    """Train on ``train_csv`` after loading the holiday list.

    Resets the ``model`` entry of the config, fixes the ensemble to a single
    ``lgb`` member, loads the ``holiday`` column of ``./holiday.csv`` into
    the config as a set of strings, then reads, preprocesses and fits.

    Args:
        train_csv: Path of the training CSV; handed to ``read_df``.
        mode: Stored under ``config["mode"]``.
    """
    cfg = self.config

    cfg["task"] = "train"
    cfg["mode"] = mode
    cfg["model"] = {}
    cfg["ensemble"] = {"lgb": 1}

    cfg.tmp_dir = cfg.model_dir + "/tmp"
    os.makedirs(cfg.tmp_dir, exist_ok=True)

    # Holiday dates are read as strings from a file relative to the current
    # working directory.
    holiday_frame = pd.read_csv(
        './holiday.csv',
        encoding='utf-8',
        low_memory=False,
        dtype={'holiday': str},
    )
    cfg['holiday'] = set(holiday_frame['holiday'].values)

    df = read_df(train_csv, cfg)
    print(df.shape)

    # Both helpers are called for their effect on df / cfg; their return
    # values are not used here.
    holiday_detect(df, cfg)
    preprocess(df, cfg)

    y = df["target"]
    X = df.drop("target", axis=1)
    train(X, y, cfg)
def train(self, train_csv: str, mode: str):
    """Read ``train_csv``, preprocess it and fit the model.

    Args:
        train_csv: Path of the training CSV; handed to ``read_df``.
        mode: Stored under ``config["mode"]``.
    """
    config = self.config
    config["task"] = "train"
    config["mode"] = mode

    # Scratch directory lives under the model directory.
    config.tmp_dir = config.model_dir + "/tmp"
    os.makedirs(config.tmp_dir, exist_ok=True)

    frame = read_df(train_csv, config)
    # preprocess is used for its in-place effect; its result is ignored.
    preprocess(frame, config)

    target = frame["target"]
    features = frame.drop(columns="target")
    train(features, target, config)
def train(self, train_csv: str, mode: str):
    """Read ``train_csv`` and run the training pipeline on it.

    The ``stages_time`` entry of the config is reset to an empty dict, the
    time spent in ``read_df`` is recorded through ``stage_time_inc``, and
    the collected timings are printed when ``config.verbose`` is truthy.

    Args:
        train_csv: Path of the training CSV; handed to ``read_df``.
        mode: Stored under ``config["mode"]``.
    """
    cfg = self.config

    cfg["task"] = "train"
    cfg["mode"] = mode
    cfg['stages_time'] = {}

    cfg.tmp_dir = os.path.join(cfg.model_dir, "tmp")
    os.makedirs(cfg.tmp_dir, exist_ok=True)

    # Time only the read step; the pipeline is expected to account for its
    # own stages (assumption - confirm against pipeline()).
    read_started = time.time()
    df = read_df(train_csv, cfg)
    stage_time_inc(cfg, read_started, 'train read_df')

    pipeline(df, cfg)

    if not cfg.verbose:
        return
    self.stages_time_print()
def train(self, train_csv: str, mode: str):
    """Read ``train_csv``, preprocess it, log the feature shape and fit.

    Args:
        train_csv: Path of the training CSV; handed to ``read_df``.
        mode: Stored under ``config["mode"]``.
    """
    cfg = self.config
    cfg["task"] = "train"
    cfg["mode"] = mode

    cfg.tmp_dir = cfg.model_dir + "/tmp"
    os.makedirs(cfg.tmp_dir, exist_ok=True)

    # Load the data.
    df = read_df(train_csv, cfg)

    # Preprocessing is applied for its in-place effect; the return value is
    # not used.
    preprocess(df, cfg)

    y = df["target"]
    X = df.drop(columns="target")
    log('drop target')

    # Timestamped banner showing the feature matrix shape before training.
    timestamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    log('####### cur time = ' + timestamp)
    log('################## after FE #########################')
    log(X.shape)
    log('#####################################################')

    train(X, y, cfg)
def train(self, train_csv: str, mode: str):
    """Configure objective and metric for ``mode``, then read, preprocess and fit.

    ``mode == "regression"`` selects the ``regression`` objective with the
    ``rmse`` metric; any other mode selects ``binary`` with ``auc``. The
    feature column names are stored under ``config["columns"]`` before
    training.

    Args:
        train_csv: Path of the training CSV; handed to ``read_df``.
        mode: Stored under ``config["mode"]`` and used to pick the
            objective and metric.
    """
    cfg = self.config
    cfg["task"] = "train"
    cfg["mode"] = mode

    if mode == "regression":
        cfg["objective"] = "regression"
        cfg["metric"] = "rmse"
    else:
        cfg["objective"] = "binary"
        cfg["metric"] = "auc"

    cfg.tmp_dir = cfg.model_dir + "/tmp"
    os.makedirs(cfg.tmp_dir, exist_ok=True)

    # Unlike an in-place preprocess call, this variant uses the returned frame.
    frame = preprocess(read_df(train_csv, cfg), cfg)

    y = frame["target"].copy()
    X = frame.drop(columns="target").copy()

    # Drop the combined frame before training to release its memory.
    del frame
    gc.collect()

    cfg["columns"] = list(X.columns)
    train(X, y, cfg)