def fit(self, X, y, test_data=None):
    """Train a model by running the external executable on dumped data.

    The training set (and any validation sets) are written into a
    temporary directory, the executable at ``self.exec_path`` is run in
    ``train`` mode, and the text of the produced model file is stored in
    ``self.model``. The temporary directory is removed afterwards, even
    when an error occurs.

    Parameters
    ----------
    X :
        Feature matrix. When ``sps.issparse(X)`` is true the data is
        dumped with the "svm" extension, otherwise with "csv".
    y :
        Targets, passed together with ``X`` to ``io_utils.dump_data``.
    test_data : optional
        Iterable of ``(x_test, y_test)`` pairs. Each pair is dumped to its
        own file and the comma-joined file list is stored in
        ``self.param['valid']``.

    Notes
    -----
    * ``self.param`` is updated in place (``task``, ``data``,
      ``output_model`` and, with ``test_data``, ``valid``).
    * When ``self.config`` is the empty string a config file is generated
      from ``self.param``; otherwise the file named by ``self.config`` is
      passed to the executable unchanged and the generated parameters are
      not written anywhere. The model is still read from
      ``self.param['output_model']`` in that case.
    * ``self.best_round`` is set only when ``test_data`` is truthy and
      ``self.param['early_stopping_round'] > 0``.
    """
    # Temporary directory holding the dumped data and the trained model.
    tmp_dir = tempfile.mkdtemp()
    try:
        issparse = sps.issparse(X)
        f_format = "svm" if issparse else "csv"

        train_filepath = os.path.abspath("{}/X.{}".format(tmp_dir, f_format))
        io_utils.dump_data(X, y, train_filepath, issparse)

        if test_data:
            valid = []
            for i, (x_test, y_test) in enumerate(test_data):
                test_filepath = os.path.abspath(
                    "{}/X{}_test.{}".format(tmp_dir, i, f_format))
                valid.append(test_filepath)
                io_utils.dump_data(x_test, y_test, test_filepath, issparse)
            self.param['valid'] = ",".join(valid)

        self.param['task'] = 'train'
        self.param['data'] = train_filepath
        self.param['output_model'] = os.path.join(
            tmp_dir, "LightGBM_model.txt")

        if self.config == "":
            # No user-supplied config: serialize self.param as key=value
            # lines into a throwaway config file.
            config_path = os.path.join(tmp_dir, "train.conf")
            calls = ["{}={}\n".format(k, self.param[k]) for k in self.param]
            with open(config_path, 'w') as f:
                f.writelines(calls)
        else:
            config_path = self.config

        process = subprocess.Popen(
            [self.exec_path, "config={}".format(config_path)],
            stdout=subprocess.PIPE, bufsize=1)

        # Drain stdout until EOF so the child can never block on a full
        # pipe; echo the lines only in verbose mode.
        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        # Wait for the subprocess to exit.
        process.wait()

        # If the run produced no model file, this open() raises.
        with open(self.param['output_model'], mode='r') as file:
            self.model = file.read()
    finally:
        # Always clean up, including when dumping/training/reading failed.
        shutil.rmtree(tmp_dir)

    if test_data and self.param['early_stopping_round'] > 0:
        # Highest "Tree=<n>" index found in the model text, plus one.
        self.best_round = max(
            map(int, re.findall(r"Tree=(\d+)", self.model))) + 1
def predict_proba(self, X):
    """Return class probabilities for ``X`` using the stored model.

    ``self.model`` is written to a temporary directory together with the
    dumped features, the executable at ``self.exec_path`` is run in
    ``predict`` mode, and the result file is loaded with ``np.loadtxt``.
    The temporary directory is removed afterwards, even when an error
    occurs.

    Parameters
    ----------
    X :
        Feature matrix. When ``sps.issparse(X)`` is true the data is
        dumped with the "svm" extension, otherwise with "csv".

    Returns
    -------
    numpy.ndarray
        For ``self.param['application'] == 'multiclass'`` the loaded
        result is returned as is. For ``'binary'`` the loaded values are
        treated as the probability of class one and a two-column array
        ``[1 - p, p]`` is returned.

    Raises
    ------
    ValueError
        If ``self.param['application']`` is neither ``'binary'`` nor
        ``'multiclass'``. This is checked before any file is written or
        the executable is started.
    """
    application = self.param['application']
    if application not in ('multiclass', 'binary'):
        raise ValueError(
            "predict_proba supports only the 'binary' and 'multiclass' "
            "applications, got {!r}".format(application))

    tmp_dir = tempfile.mkdtemp()
    try:
        issparse = sps.issparse(X)
        f_format = "svm" if issparse else "csv"

        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, "X_to_pred.{}".format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))

        with open(output_model, mode="w") as file:
            file.write(self.model)

        # The dump helper takes a label argument; zeros are passed as
        # placeholders since there are no labels at prediction time.
        io_utils.dump_data(X, np.zeros(X.shape[0]), predict_filepath, issparse)

        calls = [
            "task = predict\n",
            "data = {}\n".format(predict_filepath),
            "input_model = {}\n".format(output_model),
            "output_result={}\n".format(output_results),
        ]
        with open(conf_filepath, 'w') as f:
            f.writelines(calls)

        process = subprocess.Popen(
            [self.exec_path, "config={}".format(conf_filepath)],
            stdout=subprocess.PIPE)

        # communicate() reads all output and waits for the process to
        # exit, so nothing is lost if the child finishes quickly.
        output, _ = process.communicate()
        if self.verbose:
            print(output.strip().decode('utf-8'))

        raw_probabilities = np.loadtxt(output_results, dtype=float)
    finally:
        # Always clean up, including when the run or the load failed.
        shutil.rmtree(tmp_dir)

    if application == 'multiclass':
        return raw_probabilities

    # Binary: build the two-column [P(class 0), P(class 1)] array.
    probability_of_one = raw_probabilities
    probability_of_zero = 1 - probability_of_one
    return np.transpose(
        np.vstack((probability_of_zero, probability_of_one)))
def predict(self, X):
    """Return raw predictions for ``X`` using the stored model.

    ``self.model`` is written to a temporary directory together with the
    dumped features, the executable at ``self.exec_path`` is run in
    ``predict`` mode, and the result file is loaded with ``np.loadtxt``.
    The temporary directory is removed afterwards, even when an error
    occurs.

    Parameters
    ----------
    X :
        Feature matrix. When ``sps.issparse(X)`` is true the data is
        dumped with the "svm" extension, otherwise with "csv".

    Returns
    -------
    numpy.ndarray
        Contents of the prediction result file parsed as floats.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        issparse = sps.issparse(X)
        f_format = "svm" if issparse else "csv"

        predict_filepath = os.path.abspath(
            os.path.join(tmp_dir, "X_to_pred.{}".format(f_format)))
        output_model = os.path.abspath(os.path.join(tmp_dir, "model"))
        output_results = os.path.abspath(
            os.path.join(tmp_dir, "LightGBM_predict_result.txt"))
        conf_filepath = os.path.join(tmp_dir, "predict.conf")

        with open(output_model, mode="w") as file:
            file.write(self.model)

        # The dump helper takes a label argument; zeros are passed as
        # placeholders since there are no labels at prediction time.
        io_utils.dump_data(X, np.zeros(X.shape[0]), predict_filepath, issparse)

        calls = [
            "task = predict\n",
            "data = {}\n".format(predict_filepath),
            "input_model = {}\n".format(output_model),
            "output_result={}\n".format(output_results),
        ]
        with open(conf_filepath, 'w') as f:
            f.writelines(calls)

        process = subprocess.Popen(
            [self.exec_path, "config={}".format(conf_filepath)],
            stdout=subprocess.PIPE)

        # Drain stdout until EOF (closing the pipe afterwards) so no
        # trailing output is dropped and the child cannot block on a full
        # pipe; echo the lines only in verbose mode.
        with process.stdout:
            for line in iter(process.stdout.readline, b''):
                if self.verbose:
                    print(line.strip().decode('utf-8'))
        # Wait for the subprocess to exit before reading its result file.
        process.wait()

        y_pred = np.loadtxt(output_results, dtype=float)
    finally:
        # Always clean up, including when the run or the load failed.
        shutil.rmtree(tmp_dir)

    return y_pred