Beispiel #1
0
    def fit(self, X, y, test_data=None):
        """Train a model by running the LightGBM command-line executable.

        The training data (and any validation sets) are written into a
        temporary directory with ``io_utils.dump_data``. Unless
        ``self.config`` names a config file, one is generated from
        ``self.param``. ``self.exec_path`` is then run as a subprocess and
        the model file it produces is read back as text into ``self.model``.

        Parameters
        ----------
        X : dense array or scipy sparse matrix
            Training features. Sparse input is dumped with the "svm" file
            extension, anything else with "csv".
        y : array-like
            Training targets, passed to ``io_utils.dump_data`` with ``X``.
        test_data : sequence of (x_test, y_test) pairs, optional
            Validation sets. Each one is dumped to its own file and the
            comma-joined file list is stored in ``self.param['valid']``.

        Side effects
        ------------
        Sets ``self.param['task']``, ``self.param['data']`` and
        ``self.param['output_model']`` (plus ``'valid'`` when ``test_data``
        is given), ``self.model``, and -- when ``test_data`` is given and
        ``self.param['early_stopping_round'] > 0`` -- ``self.best_round``.
        The paths stored in ``self.param`` point into the temporary
        directory, which no longer exists once this method returns.
        """
        # Temporary directory holding the dumped data, the generated config
        # and (most importantly) the model file written by the executable.
        tmp_dir = tempfile.mkdtemp()
        try:
            issparse = sps.issparse(X)
            f_format = "svm" if issparse else "csv"

            train_filepath = os.path.abspath(
                os.path.join(tmp_dir, "X.{}".format(f_format)))
            io_utils.dump_data(X, y, train_filepath, issparse)

            if test_data:
                valid = []
                for i, (x_test, y_test) in enumerate(test_data):
                    test_filepath = os.path.abspath(os.path.join(
                        tmp_dir, "X{}_test.{}".format(i, f_format)))
                    valid.append(test_filepath)
                    io_utils.dump_data(x_test, y_test, test_filepath,
                                       issparse)
                self.param['valid'] = ",".join(valid)

            self.param['task'] = 'train'
            self.param['data'] = train_filepath
            self.param['output_model'] = os.path.join(tmp_dir,
                                                      "LightGBM_model.txt")

            if self.config == "":
                # No user-supplied config: serialize self.param as
                # "key=value" lines into a config file of our own.
                conf_filepath = os.path.join(tmp_dir, "train.conf")
                with open(conf_filepath, 'w') as f:
                    f.writelines("{}={}\n".format(k, self.param[k])
                                 for k in self.param)
            else:
                conf_filepath = self.config

            # bufsize=1 (line buffering) is only meaningful for text-mode
            # pipes; with a binary pipe it merely triggers a RuntimeWarning
            # on recent Python versions, so the default buffering is used.
            process = subprocess.Popen(
                [self.exec_path, "config={}".format(conf_filepath)],
                stdout=subprocess.PIPE)

            # Drain stdout until EOF so the child can never block on a full
            # pipe, echoing each line only when verbose.
            with process.stdout:
                for line in iter(process.stdout.readline, b''):
                    if self.verbose:
                        print(line.strip().decode('utf-8'))
            # wait for the subprocess to exit
            process.wait()

            with open(self.param['output_model'], mode='r') as model_file:
                self.model = model_file.read()
        finally:
            # Always remove the temporary directory, including when dumping
            # data, running the executable or reading the model fails.
            shutil.rmtree(tmp_dir)

        if test_data and self.param['early_stopping_round'] > 0:
            # The model text contains one "Tree=<index>" entry per tree;
            # the highest index + 1 is taken as the best round.
            self.best_round = max(
                map(int, re.findall(r"Tree=(\d+)", self.model))) + 1
Beispiel #2
0
    def predict_proba(self, X):
        """Return class probabilities predicted by the stored model.

        ``self.model`` and ``X`` are written to a temporary directory, the
        LightGBM executable at ``self.exec_path`` is run with a generated
        ``task = predict`` config, and the result file is loaded with
        ``np.loadtxt``.

        Parameters
        ----------
        X : dense array or scipy sparse matrix
            Samples to score; must expose ``shape``. Sparse input is dumped
            with the "svm" file extension, anything else with "csv".

        Returns
        -------
        numpy.ndarray
            For ``self.param['application'] == 'multiclass'`` the loaded
            values as a 2-D array. For ``'binary'`` a two-column array
            ``[1 - p, p]`` where ``p`` is the loaded value for each sample.

        Raises
        ------
        NotImplementedError
            If ``self.param['application']`` is neither ``'binary'`` nor
            ``'multiclass'``. This is a ``RuntimeError`` subclass, so code
            that caught the ``RuntimeError`` raised by the previous bare
            ``raise`` keeps working.
        """
        application = self.param['application']
        if application not in ('binary', 'multiclass'):
            # Fail before any file or subprocess work is done, and with a
            # message that says what is wrong.
            raise NotImplementedError(
                "predict_proba supports only the 'binary' and 'multiclass' "
                "applications, got {!r}".format(application))

        tmp_dir = tempfile.mkdtemp()
        try:
            issparse = sps.issparse(X)
            f_format = "svm" if issparse else "csv"

            predict_filepath = os.path.abspath(
                os.path.join(tmp_dir, "X_to_pred.{}".format(f_format)))
            output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
            conf_filepath = os.path.join(tmp_dir, "predict.conf")
            output_results = os.path.abspath(
                os.path.join(tmp_dir, "LightGBM_predict_result.txt"))

            with open(output_model, mode="w") as model_file:
                model_file.write(self.model)

            # A zero vector is passed in the target position of dump_data;
            # presumably placeholder labels -- verify against io_utils.
            io_utils.dump_data(X, np.zeros(X.shape[0]), predict_filepath,
                               issparse)

            calls = [
                "task = predict\n", "data = {}\n".format(predict_filepath),
                "input_model = {}\n".format(output_model),
                "output_result={}\n".format(output_results)
            ]

            with open(conf_filepath, 'w') as f:
                f.writelines(calls)

            process = subprocess.Popen(
                [self.exec_path, "config={}".format(conf_filepath)],
                stdout=subprocess.PIPE)

            # communicate() reads all output, waits for the process to exit
            # and closes the pipe; one call is enough in both modes.
            output = process.communicate()[0]
            if self.verbose:
                print(output.strip().decode('utf-8'))

            raw_probabilities = np.loadtxt(output_results, dtype=float)
        finally:
            # Always remove the temporary directory, even on failure.
            shutil.rmtree(tmp_dir)

        if application == 'multiclass':
            # loadtxt collapses a single-row file to 1-D; keep the result
            # 2-D so a single sample still yields one row.
            return np.atleast_2d(raw_probabilities)

        # binary: the file holds one value per sample, used as P(class 1).
        probability_of_one = raw_probabilities
        probability_of_zero = 1 - probability_of_one
        return np.transpose(
            np.vstack((probability_of_zero, probability_of_one)))
Beispiel #3
0
    def predict(self, X):
        """Return the raw predictions of the stored model for ``X``.

        ``self.model`` and ``X`` are written to a temporary directory, the
        LightGBM executable at ``self.exec_path`` is run with a generated
        ``task = predict`` config, and the result file is loaded with
        ``np.loadtxt``.

        Parameters
        ----------
        X : dense array or scipy sparse matrix
            Samples to score; must expose ``shape``. Sparse input is dumped
            with the "svm" file extension, anything else with "csv".

        Returns
        -------
        numpy.ndarray
            The contents of the result file parsed as floats.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            issparse = sps.issparse(X)
            f_format = "svm" if issparse else "csv"

            predict_filepath = os.path.abspath(
                os.path.join(tmp_dir, "X_to_pred.{}".format(f_format)))
            output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
            output_results = os.path.abspath(
                os.path.join(tmp_dir, "LightGBM_predict_result.txt"))
            conf_filepath = os.path.join(tmp_dir, "predict.conf")

            with open(output_model, mode="w") as model_file:
                model_file.write(self.model)

            # A zero vector is passed in the target position of dump_data;
            # presumably placeholder labels -- verify against io_utils.
            io_utils.dump_data(X, np.zeros(X.shape[0]), predict_filepath,
                               issparse)

            calls = [
                "task = predict\n", "data = {}\n".format(predict_filepath),
                "input_model = {}\n".format(output_model),
                "output_result={}\n".format(output_results)
            ]

            with open(conf_filepath, 'w') as f:
                f.writelines(calls)

            process = subprocess.Popen(
                [self.exec_path, "config={}".format(conf_filepath)],
                stdout=subprocess.PIPE)

            # Read stdout to EOF (as fit does) rather than polling for
            # process exit: polling could drop output still buffered when
            # the process ended and print empty lines once EOF was hit.
            with process.stdout:
                for line in iter(process.stdout.readline, b''):
                    if self.verbose:
                        print(line.strip().decode('utf-8'))
            # wait for the subprocess to exit
            process.wait()

            y_pred = np.loadtxt(output_results, dtype=float)
        finally:
            # Always remove the temporary directory, even on failure.
            shutil.rmtree(tmp_dir)

        return y_pred