Beispiel #1
0
    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        self.config["task"] = "predict"
        self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        result = {
            "line_id": list(df["line_id"]),
            "prediction": [],
        }

        def chunker(seq, size):
            return (seq[pos:pos+size] for pos in range(0, len(seq), size))

        for chunk in chunker(df, 100000):
            X = chunk.copy()
            preprocess(X, self.config)
            result["prediction"] += list(predict(X, self.config))

        result = pd.DataFrame(result)
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        target_csv = test_csv.replace("test", "test-target")
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score
Beispiel #2
0
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config["model"] = {}
        self.config["ensemble"] = {"lgb": 1}

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        # load holiday
        path_holiday = './holiday.csv'
        holiday = pd.read_csv(path_holiday, \
                      encoding='utf-8', low_memory=False, dtype={'holiday':str})['holiday'].values
        self.config['holiday'] = set(holiday)

        df = read_df(train_csv, self.config)
        print(df.shape)

        holiday_detect(df, self.config)

        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)

        train(X, y, self.config)
Beispiel #3
0
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        preprocess(df, self.config)

        y = df["target"]
        X = df.drop("target", axis=1)
        train(X, y, self.config)
Beispiel #4
0
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config['stages_time'] = {}
        self.config.tmp_dir = os.path.join(self.config.model_dir, "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        start_time = time.time()
        df = read_df(train_csv, self.config)
        stage_time_inc(self.config, start_time, 'train read_df')
        
        pipeline(df, self.config)
        
        if self.config.verbose:
            self.stages_time_print()
Beispiel #5
0
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode
        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        ## prepare data
        df = read_df(train_csv, self.config)
        
        ## preprecessing
        preprocess(df, self.config)

        
        y = df["target"]
        X = df.drop("target", axis=1)
        log('drop target')
        log('####### cur time = ' + str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        log('################## after FE #########################')
        log(X.shape)
        log('#####################################################')
        train(X, y, self.config)
Beispiel #6
0
    def train(self, train_csv: str, mode: str):
        self.config["task"] = "train"
        self.config["mode"] = mode

        self.config[
            "objective"] = "regression" if mode == "regression" else "binary"
        self.config["metric"] = "rmse" if mode == "regression" else "auc"

        self.config.tmp_dir = self.config.model_dir + "/tmp"
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(train_csv, self.config)
        df = preprocess(df, self.config)

        y = df["target"].copy()
        X = df.drop("target", axis=1).copy()
        del df
        gc.collect()

        self.config["columns"] = list(X)

        train(X, y, self.config)