def cv(X_t, y_t, X_test, y_test):
    """Train a binary-logistic booster on ``(X_t, y_t)`` while watching a test set.

    Despite its name, this calls ``xgb.train`` (not ``xgb.cv``) with a fixed
    set of hyper-parameters. ``(X_test, y_test)`` is wrapped in a ``DMatrix``
    and passed as the ``evals`` entry named ``'Test'``; the configured
    ``eval_metric`` is ``'auc'``.

    :param X_t: training features accepted by ``DMatrix``
    :param y_t: training labels
    :param X_test: evaluation features
    :param y_test: evaluation labels
    :return: the booster returned by ``xgb.train``
    """
    dtrain = DMatrix(X_t, label=y_t)
    dtest = DMatrix(X_test, label=y_test)

    params = {
        "objective": "binary:logistic",
        'max_depth': 9,
        'min_child_weight': 2,
        'subsample': 0.8,
        'eta': 0.1,
        'alpha': 0.2,
        'lambda': 0.2,
        'eval_metric': 'auc',
    }
    # Fixed round count; the original note reads "model.best_iteration + 1".
    num_boost_round = 313

    # fit
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtest, 'Test')],
    )
    return model
def main():
    """Train a ``rank:pairwise`` booster from two svmlight files and save it.

    Both feature matrices are densified and extended with two ratio columns
    (column -2 over column 2, and column -1 over column 4) before training.
    The model file name is derived from the training parameters, excluding
    ``objective`` and ``eval_metric``.
    """

    def _with_ratio_columns(sparse_features):
        # Densify, then append the two derived ratio columns on axis 1.
        dense = sparse_features.todense()
        return np.concatenate(
            [dense, dense[:, -2] / dense[:, 2], dense[:, -1] / dense[:, 4]],
            1)

    def _group_sizes(query_ids):
        # Length of each run of consecutive identical query ids.
        return [len(list(members)) for __, members in groupby(query_ids)]

    # NOTE(review): ``args`` is not defined in this function; presumably a
    # module-level name -- verify.
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(  # pylint: disable=unbalanced-tuple-unpacking
        p.train.xgboost_train_path, query_id=True)
    x_test, y_test, qid_test = load_svmlight_file(  # pylint: disable=unbalanced-tuple-unpacking
        p.train.xgboost_test_path, query_id=True)

    x_train = _with_ratio_columns(x_train)
    x_test = _with_ratio_columns(x_test)

    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group(_group_sizes(qid_train))
    test_dmatrix.set_group(_group_sizes(qid_test))

    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])

    naming_params = _.omit(params, 'objective', 'eval_metric')
    xgb_train_str = items_to_str(naming_params.items(), sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
def train_ranking():
    """Train a ranking booster, reload it from disk and dump its predictions.

    The data returned by ``data_generation({})`` is used for training,
    evaluation and testing alike. The model is written to ``xgb.model``
    and the predictions, one per line, to ``results.txt``.
    """

    def _grouped_dmatrix(rows, targets, groups):
        # Build a DMatrix and attach its query-group sizes.
        matrix = DMatrix(np.asmatrix(rows), label=targets)
        matrix.set_group(groups)
        return matrix

    train_group_list, train_data_list, train_target_list = data_generation({})
    # The evaluation and test splits are aliases of the training split.
    eval_group_list, eval_data_list, eval_target_list = (
        train_group_list, train_data_list, train_target_list)
    test_group_list, test_data_list, test_target_list = (
        train_group_list, train_data_list, train_target_list)

    xgbTrain = _grouped_dmatrix(train_data_list, train_target_list, train_group_list)
    xgbEval = _grouped_dmatrix(eval_data_list, eval_target_list, eval_group_list)

    rankModel = train(xgb_rank_params2,
                      xgbTrain,
                      num_boost_round=50,
                      evals=[(xgbTrain, 'train'), (xgbEval, 'eval')])
    rankModel.save_model('xgb.model')

    # Predict with the reloaded model rather than the in-memory one.
    loaded_model = xgb.Booster(model_file='xgb.model')
    xgbTest = _grouped_dmatrix(test_data_list, test_target_list, test_group_list)
    results = loaded_model.predict(xgbTest)

    with open('results.txt', mode='w', encoding='utf-8') as f:
        f.writelines(str(item) + '\n' for item in results)
def to_dmatrix(data, labels=None):
    """Wrap ``data`` (and optionally ``labels``) in a ``DMatrix``.

    pandas ``DataFrame``/``Series`` inputs are converted to their underlying
    ``.values`` first; anything else is handed to ``DMatrix`` unchanged.

    :param data: feature container accepted by ``DMatrix`` or a pandas object
    :param labels: optional labels, pandas or array-like; ``None`` means
        "no labels"
    :return: the constructed ``DMatrix``
    """
    pandas_types = (pd.DataFrame, pd.Series)

    if isinstance(data, pandas_types):
        data = data.values

    # Identity check: ``labels != None`` is evaluated element-wise for pandas
    # and NumPy objects, and its truth value is ambiguous.
    if labels is None:
        return DMatrix(data)

    if isinstance(labels, pandas_types):
        labels = labels.values
    return DMatrix(data, labels)
def test_dmatrix_creator(self):
    """Check ``_convert_partition_data_to_dmatrix`` against hand-built DMatrices.

    Two cases are covered: without weights and with per-row weights.
    """
    # Standard testing DMatrix creation
    expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    expected_labels = np.array([1, 0] * 100)
    expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )
    # You can't compare DMatrix outputs, so the only way is to predict on the
    # two separate DMatrices using the same classifier and make sure the
    # outputs are equal.
    model = XGBClassifier()
    model.fit(expected_features, expected_labels)
    expected_preds = model.get_booster().predict(expected_dmatrix)
    output_preds = model.get_booster().predict(output_dmatrix)
    self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

    # DMatrix creation with weights
    expected_weight = np.array([0.2, 0.8] * 100)
    expected_dmatrix = DMatrix(data=expected_features,
                               label=expected_labels,
                               weight=expected_weight)

    data["weight"] = [0.2, 0.8] * 100
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )

    model.fit(expected_features, expected_labels, sample_weight=expected_weight)
    expected_preds = model.get_booster().predict(expected_dmatrix)
    output_preds = model.get_booster().predict(output_dmatrix)
    self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
def main():
    """Train a small pairwise ranker on the ``hn.*`` files and print metrics.

    Prints one sample query, the mean reciprocal rank, the mean average
    precision result and nDCG at cut-offs 1, 3, 5 and 10 for the test set.
    """
    # Import training data
    x_train, y_train, qid_train = load_svmlight_file("hn.train", query_id=True)
    x_valid, y_valid, qid_valid = load_svmlight_file("hn.vali", query_id=True)
    x_test, y_test, qid_test = load_svmlight_file("hn.test", query_id=True)

    group_train = group_qid(qid_train)
    group_valid = group_qid(qid_valid)
    group_test = group_qid(qid_test)

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)
    for matrix, group in ((train_dmatrix, group_train),
                          (valid_dmatrix, group_valid),
                          (test_dmatrix, group_test)):
        matrix.set_group(group)

    # Train Xgboost with basic parameters
    params = {
        'objective': 'rank:pairwise',
        'eta': 0.1,
        # 'gamma': 1.0,
        # 'min_child_weight': 0.1,
        'max_depth': 3,
        'eval_metric': ['ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10'],
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)
    data_predict = regroup_results(group_test, pred, y_test)

    # Testing random sample
    def print_random_sample(line):
        # Simple debug helper: show labels and predictions for query ``line``.
        start = sum(group_test[:line])
        window = slice(start, start + group_test[line])
        print('Algolia clicks are: {}'.format(y_test[window]))
        print('Predictions are: {}'.format(pred[window]))
        print('Xgboost clicks are: {}'.format(data_predict[line]))

    print_random_sample(1)

    mrr = mean_reciprocal_rank(data_predict)
    print('> Mean reciprocal rank is : {}'.format(mrr))
    mean_ap = mean_average_precision(data_predict)
    print('> Mean average position is : {}'.format(mean_ap))

    # nDCG
    for i in [1, 3, 5, 10]:
        ndcg_ = [ndcg_at_k(query, i) for query in data_predict]
        print('> nDCG@{} is : {}'.format(i, pd.Series(ndcg_).mean()))
def train(model_file):
    """Demonstrate ranking with ``xgboost.train`` on the ``mq2008`` files.

    Saves the model to ``model_file``, a text dump to ``model_file + ".txt"``
    and a feature-importance plot to ``feature_importance.png``.

    :param model_file: path the trained booster is saved to
    """

    def _read_group_sizes(path):
        # One integer group size per line.
        with open(path, "r", encoding="utf8") as f:
            return [int(line.split("\n")[0]) for line in f.readlines()]

    x_train, y_train = load_svmlight_file("mq2008.train")
    x_valid, y_valid = load_svmlight_file("mq2008.vali")
    x_test, y_test = load_svmlight_file("mq2008.test")

    group_train = _read_group_sizes("mq2008.train.group")
    group_valid = _read_group_sizes("mq2008.vali.group")
    # Read but not attached to ``test_dmatrix`` below.
    group_test = _read_group_sizes("mq2008.test.group")

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)

    params = {
        'objective': 'rank:pairwise',
        'eta': 0.01,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 8
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    # The predictions are computed but not used afterwards.
    pred = xgb_model.predict(test_dmatrix)

    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)

    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('feature_importance.png', dpi=800, format='png')
def _convert_partition_data_to_dmatrix(
    partition_data_iter,
    has_weight,
    has_validation,
    has_base_margin,
    dmatrix_kwargs=None,
):  # pylint: disable=too-many-locals, unbalanced-tuple-unpacking
    """Build the training (and optionally validation) ``DMatrix``.

    :param partition_data_iter: forwarded to ``_prepare_train_val_data``
    :param has_weight: forwarded to ``_prepare_train_val_data``
    :param has_validation: when truthy, a validation ``DMatrix`` is built too
    :param has_base_margin: forwarded to ``_prepare_train_val_data``
    :param dmatrix_kwargs: extra keyword arguments for every ``DMatrix``
    :return: ``(training_dmatrix, val_dmatrix)`` when ``has_validation`` is
        truthy, otherwise just ``training_dmatrix``
    """
    if not dmatrix_kwargs:
        dmatrix_kwargs = {}

    def _make_dmatrix(features, label, weight, base_margin):
        return DMatrix(
            data=features,
            label=label,
            weight=weight,
            base_margin=base_margin,
            **dmatrix_kwargs,
        )

    # Not using external storage: parse the data with the standard method.
    prepared = _prepare_train_val_data(partition_data_iter, has_weight,
                                       has_validation, has_base_margin)

    if not has_validation:
        train_x, train_y, train_w, train_b_m = prepared
        return _make_dmatrix(train_x, train_y, train_w, train_b_m)

    (
        train_x,
        train_y,
        train_w,
        train_b_m,
        val_x,
        val_y,
        val_w,
        val_b_m,
    ) = prepared
    training_dmatrix = _make_dmatrix(train_x, train_y, train_w, train_b_m)
    val_dmatrix = _make_dmatrix(val_x, val_y, val_w, val_b_m)
    return training_dmatrix, val_dmatrix
def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
    '''
    Makes n folds in input data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        X data to be trained. Labels are read from its ``'encoded_target'``
        column.
    y_train : pandas.DataFrame
        y data to be trained. Currently not read by this function.
    nfold : int
        Number of folds in CV.
    param : dict
        Booster params
    evals : list
        Evaluation metrics to be watched in CV.
    features : list
        Features selected to be trained. ``None`` or an empty sequence
        selects every column of ``X_train``.

    Returns
    -------
    ret : list
        list of CVPack objects containing the dmatrix training, testing,
        and list of parameters and metrics to use for every fold
    wt_list : list
        list of weights for each fold, as returned by ``bin_fold``
        (described originally as the size of each fold)
    '''
    # ``not features`` is ambiguous for a pandas Index / NumPy array, so test
    # for "nothing selected" explicitly.
    if features is None or len(features) == 0:
        features = X_train.columns

    out_idset, wt_list = bin_fold(X_train, nfold)
    # Training indices of fold k are all held-out sets except the k-th.
    in_idset = [
        np.concatenate([out_idset[i] for i in range(nfold) if k != i])
        for k in range(nfold)
    ]

    metric_params = [('eval_metric', itm) for itm in list(evals)]

    ret = []
    for k in range(nfold):
        # Select each fold's rows once, then split into features and label.
        train_rows = X_train.loc[in_idset[k]]
        test_rows = X_train.loc[out_idset[k]]
        dtrain = DMatrix(train_rows[features],
                         label=train_rows['encoded_target'])
        dtest = DMatrix(test_rows[features],
                        label=test_rows['encoded_target'])
        plst = list(param.items()) + metric_params
        ret.append(CVPack(dtrain, dtest, plst))
    return ret, wt_list
def test_external_storage(self):
    """Boosters trained on converted and hand-built DMatrices must agree.

    Runs once without weights and once with per-row weights.
    """
    # Instantiating base data (features, labels)
    features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    labels = np.array([1, 0] * 100)
    normal_dmatrix = DMatrix(features, labels)
    test_dmatrix = DMatrix(features)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }

    def _assert_same_predictions(reference, candidate):
        # Train on both matrices, then compare predictions on the same input.
        normal_booster = worker_train({}, reference)
        storage_booster = worker_train({}, candidate)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))

    # Creating the dmatrix based on storage
    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )

    # Testing without weights
    _assert_same_predictions(normal_dmatrix, storage_dmatrix)
    shutil.rmtree(temporary_path)

    # Testing weights
    weights = np.array([0.2, 0.8] * 100)
    normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
    data["weight"] = [0.2, 0.8] * 100

    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )

    _assert_same_predictions(normal_dmatrix, storage_dmatrix)
    shutil.rmtree(temporary_path)
def __classify(self, path):
    """Score the file at ``path`` with ``self.booster``.

    The record returned by ``self.parse_pe(path)`` is turned into a one-row
    frame, its ``sections`` and ``import`` fields are one-hot encoded with
    ``self.ohe`` and summed, and the result is joined with the remaining
    columns to form the model input.

    :param path: passed unchanged to ``self.parse_pe``
    :return: the first element of ``self.booster.predict(...)``
    """
    files = [self.parse_pe(path)]
    df = pd.DataFrame(files)
    df = df.drop(['sha256', 'size'], axis=1)
    # Expand the nested 'sections' entries into one row each while keeping
    # the outer row index.
    # NOTE(review): presumably each entry is a dict of section attributes --
    # confirm against parse_pe.
    sections = df['sections'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
    # Same expansion for the 'import' entries.
    imports = df['import'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
    imports = imports.reset_index().set_index(['index', 'dll'])
    # Expand every 'symbols' collection into one 'import' row per symbol,
    # indexed by the original row.
    imports = imports['symbols'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('import').reset_index().set_index('index')
    # Missing values produced by the join are replaced with 0.
    join = sections.join(imports).fillna(0)
    # The three categorical columns are cast to str before encoding.
    join['SectionName'] = join['SectionName'].astype('str')
    join['dll'] = join['dll'].astype('str')
    join['import'] = join['import'].astype('str')
    string_columns = ['SectionName', 'dll', 'import']
    matrix = self.ohe.transform(join[string_columns])
    index = join.index
    rows = []
    # Collapse the encoded rows sharing an index label into one summed
    # sparse row.
    # NOTE(review): slice_indexer presumably relies on equal labels being
    # contiguous in the index -- confirm.
    for i in index.unique():
        select = index.slice_indexer(start=i, end=i)
        rows.append(csr_matrix(matrix[select].sum(axis=0)))
    join_encoded = pd.DataFrame(data={'matrix':rows})
    df = df.drop(['sections', 'import'], axis=1)
    df = df.join(join_encoded)
    # Per row: the remaining columns cast to int64, stacked with the encoded
    # sparse row.
    X = df.apply(lambda x: hstack((x.drop('matrix').astype('int64').values, x['matrix'])).T, axis=1)
    X = hstack(X.values).T
    # Convert to a dense array before handing it to DMatrix.
    X = X.todok().toarray()
    return self.booster.predict(DMatrix(X))[0]
def trainModel(self, train_x, train_y):
    """Train the XGBoost + logistic-regression stack and pickle its parts.

    An ``XGBClassifier`` is fitted first; the leaf index of every tree for
    every training row is then one-hot encoded and used as the feature
    matrix of the LR model. The classifier, the LR model and the encoder
    are pickled to ``self.xgb_model_name``, ``self.lr_model_name`` and
    ``self.one_hot_encoder_model_name``.

    :param train_x: training features
    :param train_y: training labels
    """
    # train a xgboost model
    self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    xgb_eval_result = self.xgb_clf.evals_result()
    # Single-argument print() emits the same text on Python 2 and 3.
    print('XGB_train eval_result: %s' % (xgb_eval_result,))

    train_x_mat = DMatrix(train_x)
    train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                            pred_leaf=True)

    self.one_hot_encoder = OneHotEncoder()
    train_lr_feature_mat = self.one_hot_encoder.fit_transform(
        train_xgb_pred_mat)
    print('train_mat: %s' % (train_lr_feature_mat.shape,))

    # train a LR model
    self.lr_clf = LR()
    self.lr_clf.fit(train_lr_feature_mat, train_y)
    self.init_flag = True

    # Context managers close each file even if pickling fails. ``True`` is
    # kept as the positional pickle protocol argument, as before.
    artifacts = (
        (self.xgb_clf, self.xgb_model_name),
        (self.lr_clf, self.lr_model_name),
        (self.one_hot_encoder, self.one_hot_encoder_model_name),
    )
    for obj, file_name in artifacts:
        with open(file_name, 'wb') as out:
            pickle.dump(obj, out, True)
    print('Train xgboost and lr model done')
def get_xgb_dmatrix(tup):
    """Build a ``DMatrix`` from a six-field tuple.

    :param tup: ``(data, label, weight, missing, feature_names,
        feature_types)``
    :return: the ``DMatrix``, created with ``nthread=-1``
    """
    from xgboost import DMatrix

    data, label, weight, missing, feature_names, feature_types = tup
    options = dict(
        label=label,
        missing=missing,
        weight=weight,
        feature_names=feature_names,
        feature_types=feature_types,
        nthread=-1,
    )
    return DMatrix(data, **options)
def predict(self, smiles, get_features=get_fp, use_tqdm=False):
    """Predict a value for every SMILES string that can be canonicalised.

    Each input goes through ``Chem.MolFromSmiles(..., sanitize=False)`` and
    ``Chem.MolToSmiles``. Inputs that raise, or whose canonical form is
    empty, are reported as invalid. The remaining ones are featurised with
    ``get_features`` and scored by every model in ``self.models``; the
    per-model outputs (inverse-transformed when ``self.transformer`` is
    set) are averaged.

    :param smiles: iterable of SMILES strings
    :param get_features: callable returning a 3-tuple whose first item is
        the feature matrix
    :param use_tqdm: show a progress bar while canonicalising
    :return: ``(canonical_smiles, prediction, invalid_smiles)``;
        ``prediction`` is ``[]`` when no input was valid
    """
    canonical_smiles = []
    invalid_smiles = []

    if use_tqdm:
        candidates = tqdm(smiles, desc="Calculating predictions...")
    else:
        candidates = smiles

    for raw in candidates:
        try:
            canonical = Chem.MolToSmiles(
                Chem.MolFromSmiles(raw, sanitize=False))
        except Exception:
            # Any parsing failure marks the input as invalid;
            # KeyboardInterrupt / SystemExit still propagate.
            invalid_smiles.append(raw)
            continue
        if len(canonical) == 0:
            # Record the offending input, not its empty canonical form.
            invalid_smiles.append(raw)
        else:
            canonical_smiles.append(canonical)

    if len(canonical_smiles) == 0:
        return canonical_smiles, [], invalid_smiles

    x, _, _ = get_features(canonical_smiles, sanitize=False)
    x = DMatrix(x)

    prediction = []
    for model in self.models:
        y_pred = model.predict(x)
        if self.transformer is not None:
            y_pred = self.transformer.inverse_transform(y_pred)
        prediction.append(y_pred)

    # Ensemble by averaging over the model axis.
    prediction = np.mean(np.array(prediction), axis=0)
    return canonical_smiles, prediction, invalid_smiles
def Predict(self, request: predict_pb2.PredictRequest,
            context: grpc.RpcContext):
    """Run the model named in ``request`` on the request's input tensors.

    Every feature listed under the model's ``'feature_names'`` must be
    present in ``request.inputs`` as an ``(n, 1)`` array, all with the same
    ``n``. The columns are concatenated on axis 1 and scored.

    :param request: prediction request carrying the model name and inputs
    :param context: RPC context (not read here)
    :return: model outputs reshaped to ``(outputs.size, 1)``
    :raises PythieServingException: unknown model, missing feature,
        wrongly shaped input or inconsistent input lengths
    """
    model_name = request.model_spec.name
    if model_name not in self.model_map:
        raise PythieServingException(
            f'Unknown model: {model_name}. This pythie-serving instance can only '
            f'serve one of the following: {",".join(self.model_map.keys())}'
        )

    model_dict = self.model_map[model_name]
    features_names = model_dict['feature_names']

    columns = []
    for feature_name in features_names:
        if feature_name not in request.inputs:
            raise PythieServingException(
                f'{feature_name} not set in the predict request')

        column = make_ndarray_from_tensor(request.inputs[feature_name])
        is_column_vector = len(column.shape) == 2 and column.shape[1] == 1
        if not is_column_vector:
            raise PythieServingException(
                'All input vectors should be 1D tensor')
        columns.append(column)

    distinct_lengths = {len(column) for column in columns}
    if len(distinct_lengths) != 1:
        raise PythieServingException(
            'All input vectors should have the same length')

    model = model_dict['model']
    stacked = np.concatenate(columns, axis=1)
    d_matrix = DMatrix(stacked, feature_names=features_names)
    outputs = model.predict(d_matrix, ntree_limit=model.best_ntree_limit)
    # Returned as a single-column array, one row per prediction.
    return outputs.reshape((outputs.size, 1))
def to_xgboost(self, **kwargs):
    """Return this object's data as an xgboost ``DMatrix``.

    :param kwargs: forwarded unchanged to ``self.to_numpy``
    :return: ``DMatrix`` built from ``self.to_numpy(**kwargs)``
    """
    from xgboost import DMatrix

    as_array = self.to_numpy(**kwargs)
    ## TODO: Uncomment when XGB observation models are implemented
    # dmatrix.observation_model = self.observation_model(backend="xgboost", loss="mse")
    return DMatrix(as_array)
def update(self, Xtrain, ytrain, Xval, yval, scoring, n_iterations):
    """Boost ``self.model`` one round at a time up to ``n_iterations``.

    After each round the model is scored with ``scoring(self, Xval, yval)``;
    the best score and its iteration are tracked in ``self.env``. When the
    early-stop callback raises ``EarlyStopException`` the loop ends and
    ``self.env['earlier_stop']`` is set, which turns later calls into
    no-ops.

    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features passed to ``scoring``
    :param yval: validation labels passed to ``scoring``
    :param scoring: callable ``scoring(self, Xval, yval)`` returning a score
    :param n_iterations: target value for ``self.model.n_estimators``
    """
    dtrain = DMatrix(data=Xtrain, label=ytrain)
    early_stop_callback = early_stop()

    if self.env['earlier_stop']:
        return

    remaining_rounds = n_iterations - self.model.n_estimators
    for _round in range(remaining_rounds):
        # note: get_booster() is a getter, but the booster held inside the
        # XGBClassifier is updated too. A unit test should guard this
        # against future library changes.
        booster = self.model.get_booster()
        booster.update(dtrain, iteration=self.model.n_estimators)
        self.model.n_estimators += 1

        score = scoring(self, Xval, yval)
        if score > self.env['best_score']:
            self.env['best_score'] = score
            self.env['best_iteration'] = self.model.n_estimators

        try:
            early_stop_callback(env=self.env,
                                score=score,
                                iteration=self.model.n_estimators)
        except EarlyStopException:
            print('Update Stopped Earlier! @ {} instead of {}'.format(
                self.model.n_estimators, n_iterations))
            self.env['earlier_stop'] = True
            break
def test_xgboost_booster_classifier_reg(self):
    """Convert a ``reg:squarederror`` booster to ONNX and dump data and model."""
    x, y = make_classification(n_classes=2,
                               n_features=5,
                               n_samples=100,
                               random_state=42,
                               n_informative=3)
    # Shift the class labels so the targets are non-integer floats.
    y = y.astype(np.float32) + 0.567
    x_train, x_test, y_train, _ = train_test_split(x,
                                                   y,
                                                   test_size=0.5,
                                                   random_state=42)

    booster_params = {
        'objective': 'reg:squarederror',
        'n_estimators': 3,
        'min_child_samples': 1
    }
    data = DMatrix(x_train, label=y_train)
    model = train(booster_params, data)

    input_types = [('input', FloatTensorType([None, x.shape[1]]))]
    model_onnx = convert_xgboost(model, 'tree-based classifier', input_types)

    dump_data_and_model(
        x_test.astype(np.float32),
        model,
        model_onnx,
        allow_failure=
        "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBBoosterReg")
def train_model(self, train_x, train_y):
    """
    Train the XGBoost + LR stack and pickle its three components.

    The classifier is fitted on the data, the per-tree leaf indices of the
    training rows are one-hot encoded, and a logistic regression is fitted
    on that encoding.

    :param train_x: training features
    :param train_y: training labels
    :return: None
    """
    classifier = xgb.XGBClassifier()
    self.xgb_clf = classifier
    classifier.fit(train_x, train_y,
                   eval_metric=self.xgb_eval_metric,
                   eval_set=[(train_x, train_y)])
    print('Xgb train eval result:', classifier.evals_result())

    # get boost tree leaf info
    leaf_indices = classifier.get_booster().predict(DMatrix(train_x),
                                                    pred_leaf=True)

    # begin one-hot encoding
    self.one_hot_encoder = OneHotEncoder()
    lr_features = self.one_hot_encoder.fit_transform(leaf_indices)
    print('train_mat:', lr_features.shape)

    # lr
    self.lr_clf = LogisticRegression()
    self.lr_clf.fit(lr_features, train_y)
    self.init = True

    # dump xgboost+lr model
    with open(self.xgb_model_name, 'wb') as xgb_out, \
            open(self.lr_model_name, 'wb') as lr_out, \
            open(self.one_hot_model_name, 'wb') as encoder_out:
        pickle.dump(self.xgb_clf, xgb_out, True)
        pickle.dump(self.lr_clf, lr_out, True)
        pickle.dump(self.one_hot_encoder, encoder_out, True)
def _prepare_data(self, back_training_feat, thigh_training_feat, back_temp,
                  thigh_temp, labels, samples_pr_window, sampling_freq,
                  train_overlap):
    """Segment both sensor streams and the labels, then build a ``DMatrix``.

    :param back_training_feat: back-sensor data to segment
    :param thigh_training_feat: thigh-sensor data to segment
    :param back_temp: passed as ``temp`` for the back stream
    :param thigh_temp: passed as ``temp`` for the thigh stream
    :param labels: labels to segment and one-hot encode
    :param samples_pr_window: window length used for segmentation
    :param sampling_freq: passed as ``sampling_frequency``
    :param train_overlap: passed as ``overlap``
    :return: ``(dmatrix, labels)`` with ``labels`` one-hot encoded
    """
    segment = temp_feature_util.segment_acceleration_and_calculate_features
    window_args = {
        'samples_pr_window': samples_pr_window,
        'sampling_frequency': sampling_freq,
        'overlap': train_overlap,
    }
    back_features = segment(back_training_feat, temp=back_temp, **window_args)
    thigh_features = segment(thigh_training_feat, temp=thigh_temp,
                             **window_args)

    segmented_labels = temp_feature_util.segment_labels(
        labels, samples_pr_window=samples_pr_window, overlap=train_overlap)
    labels = self._one_hot_encode(segmented_labels)

    # Stack the two feature sets column-wise and wrap them in a DMatrix.
    both_features = np.hstack((back_features, thigh_features))
    return DMatrix(both_features, label=labels), labels
def fit(self, train_x, train_y):
    """
    Fit the XGBoost + LR stack and pickle it to ``self.model_save_path``.

    The pickled object is the list
    ``[self.xgb_clf, self.lr_clf, self.one_hot_encoder]``.

    :param train_x: training features
    :param train_y: training labels
    :return: None
    """
    from xgboost import DMatrix

    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    print('Xgb train eval result:', self.xgb_clf.evals_result())

    # get boost tree leaf info
    booster = self.xgb_clf.get_booster()
    leaf_indices = booster.predict(DMatrix(train_x), pred_leaf=True)
    print(leaf_indices)

    # begin one-hot encoding
    lr_features = self.one_hot_encoder.fit_transform(leaf_indices)
    print('train_mat:', lr_features.shape)
    print('train_mat array:', lr_features.toarray())

    # lr
    self.lr_clf.fit(lr_features, train_y)
    self.init = True

    # dump xgboost+lr model
    bundle = [self.xgb_clf, self.lr_clf, self.one_hot_encoder]
    with open(self.model_save_path, 'wb') as out:
        pickle.dump(bundle, out, True)
def train(self, x, y, model=None):
    """Train a booster with ``xgb.train`` and store it in ``self.bst``.

    :param x: training features
    :param y: training labels
    :param model: optional value passed as ``xgb_model``
    """
    booster_params = vars(self.hparams.bst)
    training_matrix = DMatrix(x, label=y)
    rounds = self.hparams.num_rounds
    self.bst = xgb.train(params=booster_params,
                         dtrain=training_matrix,
                         num_boost_round=rounds,
                         xgb_model=model)
def train(self, train_x, train_y):
    """Train the xgboost_lr model.

    The XGBoost classifier is fitted first; the per-tree leaf indices of the
    training rows are then one-hot encoded and used to fit the LR model.

    Arguments:
        train_x -- training features
        train_y -- training labels
    """
    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    print("train eval result: ", self.xgb_clf.evals_result())

    # get boost tree leaf info
    booster = self.xgb_clf.get_booster()
    leaf_indices = booster.predict(DMatrix(train_x), pred_leaf=True)
    print(leaf_indices)

    lr_features = self.one_hot_encoder.fit_transform(leaf_indices)
    print('train_mat:', lr_features.shape)
    print('train_mat array:', lr_features.toarray())

    # lr
    self.lr_clf.fit(lr_features, train_y)
    self.init = True
def trainModel(self, train_x, train_y):
    """Train the XGBoost + logistic-regression stack and pickle its parts.

    An ``XGBClassifier`` is fitted first; the leaf index of every tree for
    every training row is then one-hot encoded and used as the feature
    matrix of the LR model. The classifier, the LR model and the encoder
    are pickled to ``self.xgb_model_name``, ``self.lr_model_name`` and
    ``self.one_hot_encoder_model_name``. Progress messages are printed and
    stdout is flushed along the way.

    :param train_x: training features
    :param train_y: training labels
    """
    # train a xgboost model
    sys.stdout.flush()
    self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    xgb_eval_result = self.xgb_clf.evals_result()
    # Single-argument print() emits the same text on Python 2 and 3.
    print('XGB_train eval_result: %s' % (xgb_eval_result,))
    sys.stdout.flush()

    train_x_mat = DMatrix(train_x)
    print('get boost tree leaf info...')
    train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                            pred_leaf=True)
    print('get boost tree leaf info done\n')

    print('begin one-hot encoding...')
    self.one_hot_encoder = OneHotEncoder()
    train_lr_feature_mat = self.one_hot_encoder.fit_transform(
        train_xgb_pred_mat)
    print('one-hot encoding done!\n\n')
    print('train_mat: %s' % (train_lr_feature_mat.shape,))
    sys.stdout.flush()

    # train a LR model
    self.lr_clf = LR()
    self.lr_clf.fit(train_lr_feature_mat, train_y)
    self.init_flag = True

    print('dump xgboost+lr model..')
    # Context managers close each file even if pickling fails. ``True`` is
    # kept as the positional pickle protocol argument, as before.
    artifacts = (
        (self.xgb_clf, self.xgb_model_name),
        (self.lr_clf, self.lr_model_name),
        (self.one_hot_encoder, self.one_hot_encoder_model_name),
    )
    for obj, file_name in artifacts:
        with open(file_name, 'wb') as out:
            pickle.dump(obj, out, True)
    print('Train xgboost and lr model done')
def apply(self, X, ntree_limit=0):
    """Return the predicted leaf of every tree for each sample.

    Parameters
    ----------
    X : array_like, shape=[n_samples, n_features]
        Input features matrix.

    ntree_limit : int
        Limit number of trees in the prediction; defaults to 0 (use all
        trees).

    Returns
    -------
    X_leaves : array_like, shape=[n_samples, n_trees]
        For each datapoint x in X and for each tree, return the index of the
        leaf x ends up in. Leaves are numbered within
        ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
    """
    sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)

    test_dmatrix = DMatrix(X_features, missing=self.missing)
    test_dmatrix.set_group(sizes)

    booster = self.get_booster()
    grouped_leaves = booster.predict(test_dmatrix,
                                     pred_leaf=True,
                                     ntree_limit=ntree_limit)

    # Reorder the rows using the argsort of ``group_indices``.
    positions = np.arange(len(group_indices))
    revert_group_indices = positions[group_indices.argsort()]
    return grouped_leaves[revert_group_indices, :]
def predict_xgboost_answers(xgb_model):
    """Write the model's test-set predictions to ``prediction_file.txt``.

    Each output line is the ``str()`` of a ``(key, score)`` tuple, where
    ``key`` is ``num_to_num_dict[i % CNT_ARTICLES]`` for the i-th prediction.
    The original note describes the format as
    ``((code, article), probability)``.

    :param xgb_model: trained model exposing ``predict(DMatrix)``
    """
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))

    # One integer group size per line.
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        group_test = [int(line.split("\n")[0]) for line in f]

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)

    prediction_answer = [
        (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p)
        for i, p in enumerate(pred)
    ]

    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)

    # ``with`` closes the file even if the write fails.
    with open(predict_file, 'w+', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def predict(self, x, **kwargs):
    """
    Perform prediction for a batch of inputs.

    :param x: Test set.
    :type x: `np.ndarray`
    :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
    :rtype: `np.ndarray`
    """
    from xgboost import Booster, XGBClassifier
    from art.utils import to_categorical

    # Apply preprocessing
    x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

    if isinstance(self._model, Booster):
        from xgboost import DMatrix

        booster_input = DMatrix(x_preprocessed, label=None)
        raw_predictions = self._model.predict(booster_input)
        y_prediction = np.asarray(list(raw_predictions))
        # A flat prediction vector is expanded with ``to_categorical``.
        if y_prediction.ndim == 1:
            y_prediction = to_categorical(labels=y_prediction,
                                          nb_classes=self.nb_classes())
    elif isinstance(self._model, XGBClassifier):
        y_prediction = self._model.predict_proba(x_preprocessed)

    # Apply postprocessing
    return self._apply_postprocessing(preds=y_prediction, fit=False)
def predict_xgboost_answers(xgb_model):
    """Write the model's test-set predictions to ``prediction_file.txt``.

    Features come from ``x_test.csv`` with the ``doc_id``, ``is_rel`` and
    ``7`` columns dropped. Each output line is the ``str()`` of a
    ``(doc_id, score)`` tuple; document ids are the corpus keys repeated
    ``len(pred) // CNT_ARTICLES`` times. The original note describes the
    format as ``((code, article), probability)``.

    :param xgb_model: trained model exposing ``predict(DMatrix)``
    """
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)

    # One integer group size per line.
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        group_test = [int(line.split("\n")[0]) for line in f]

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)

    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    doc_ids = list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)
    prediction_answer = [(doc_id, p) for p, doc_id in zip(pred, doc_ids)]

    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)

    # ``with`` closes the file even if the write fails.
    with open(predict_file, 'w+', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def _prediction_feature_weights(xgb, X, feature_names, xgb_feature_names):
    """ For each target, return score and numpy array with feature weights
    on this prediction, following an idea from
    http://blog.datadive.net/interpreting-random-forests/
    """
    # XGBClassifier does not have pred_leaf argument, so use booster
    booster = xgb.booster()  # type: Booster
    leaf_predictions = booster.predict(DMatrix(X, missing=xgb.missing),
                                       pred_leaf=True)
    # Exactly one row is expected; unpacking fails otherwise.
    (leaf_ids,) = leaf_predictions

    name_to_position = {f: i for i, f in enumerate(xgb_feature_names)}
    tree_dumps = booster.get_dump(with_stats=True)
    assert len(tree_dumps) == len(leaf_ids)

    target_feature_weights = partial(_target_feature_weights,
                                     feature_names=feature_names,
                                     xgb_feature_names=name_to_position)

    n_targets = _xgb_n_targets(xgb)
    if n_targets <= 1:
        return [target_feature_weights(leaf_ids, tree_dumps)]

    # For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
    # so every n_targets-th entry belongs to the same target.
    scores_weights = []
    for target_idx in range(n_targets):
        scores_weights.append(
            target_feature_weights(
                leaf_ids[target_idx::n_targets],
                tree_dumps[target_idx::n_targets],
            ))
    return scores_weights
def score(self, pred_contribs = False):
    """Score ``self.data.modeling_data`` with the fitted model.

    The scoring frame is aligned to ``self.model.train_columns``: missing
    columns are added (``0`` when the name contains ``'__'``, ``nan``
    otherwise) and the frame is restricted to the training columns, in
    training order. Missing columns are added to ``modeling_data`` in place.

    Predictions are stored in ``self.preds``. When ``pred_contribs`` is
    true, per-feature contributions plus a ``'bias'`` column are stored in
    ``self.contribs``.

    :param pred_contribs: also compute prediction contributions
    """
    model = self.model.fit_model
    scoring_data = self.data.modeling_data
    train_columns = self.model.train_columns

    missing_cols = setdiff(train_columns, list(scoring_data.columns))
    extra_cols = setdiff(list(scoring_data.columns), train_columns)
    # print('Missing cols: ' + ', '.join(missing_cols))
    # print('Extra cols: ' + ', '.join(extra_cols))

    for col in missing_cols:
        scoring_data[col] = 0 if '__' in col else nan

    try:
        scoring_data = scoring_data.drop(extra_cols, axis = 1)
    except (KeyError, TypeError, ValueError):
        # Best effort only: the column selection below discards any extra
        # columns regardless of whether this drop succeeded.
        pass
    else:
        # str() keeps the log line safe for non-string column labels.
        print('Dropping ' + ', '.join(map(str, extra_cols)))

    scoring_data = scoring_data[train_columns]
    xgb_data = DMatrix(scoring_data, label = self.data.target)

    if pred_contribs:
        contribs = model.predict(xgb_data, pred_contribs = True)
        self.contribs = pd.DataFrame.from_records(
            contribs, columns = list(scoring_data.columns) + ['bias'])
    self.preds = model.predict(xgb_data)