def test_numpy_input(self): t = xgb.XGBoostClassifier(n_estimators=10, learning_rate=0.1, silent=0) X = np.random.randn(6, 4) y = np.array([0, 1, 1, 0, 0, 1]) t.fit(X, y) scores = t.predict_proba(X) print scores
def run_pipeline(events, models):
    """Build, cross-validate, fit and persist a name-prediction pipeline.

    Args:
        events: file locations passed to Pipeline_wrapper.create_dataframe_from_files.
        models: destination folder for the saved pipeline.
    """
    tNameId = bt.Feature_id_transform(min_size=0, exclude_missing=True,
                                      zero_based=True, input_feature="name",
                                      output_feature="nameId")
    tAuto = pauto.Auto_transform(max_values_numeric_categorical=2,
                                 exclude=["nameId", "name"])
    xgb = xg.XGBoostClassifier(target="nameId", target_readable="name",
                               excluded=["name"], learning_rate=0.1, silent=1)
    cv = cf.Seldon_KFold(xgb, 5)
    transformers = [("tName", tNameId), ("tAuto", tAuto), ("cv", cv)]
    p = Pipeline(transformers)
    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe_from_files(events)
    df2 = p.fit_transform(df)
    pw.save_pipeline(p, models)
    # BUG FIX: the original also logged cv.get_scores() before fit_transform
    # ran, when no folds existed yet; scores are only meaningful after fitting.
    logger.info("cross validation scores %s", cv.get_scores())
def test_set_params(self):
    """set_params should override constructor hyper-parameters in place."""
    t = xgb.XGBoostClassifier(target="target", learning_rate=0.1, silent=0,
                              objective='binary:logistic')
    t.set_params(learning_rate=0.9, gamma=0.1)
    # assertEquals is a deprecated alias; assertEqual is the canonical name.
    self.assertEqual(t.get_params()['learning_rate'], 0.9)
def test_svmlight_features(self):
    """Fit from a dataframe column holding svmlight-style (index, value) pairs."""
    classifier = xgb.XGBoostClassifier(target="target", svmlight_feature="svm",
                                       learning_rate=0.1, silent=0,
                                       objective='binary:logistic')
    frame = pd.DataFrame([{"svm": [(1, 2.0), (2, 3.0)], "target": 1}])
    classifier.fit(frame)
def train(self, sample):
    """Fit a sentiment pipeline (tfidf -> filter -> svmlight -> xgboost CV).

    Reads CSV data from self.data_folder, optionally subsamples it, fits the
    pipeline, saves it under self.model_folder and returns the fitted pipeline.
    """
    tfidf_step = ptfidf.Tfidf_transform(input_feature="review",
                                        output_feature="tfidf",
                                        target_feature="sentiment",
                                        min_df=10, max_df=0.7,
                                        select_features=False,
                                        topn_features=50000,
                                        stop_words="english",
                                        ngram_range=[1, 2])
    filter_step = bt.Include_features_transform(included=["tfidf", "sentiment"])
    svm_step = bt.Svmlight_transform(output_feature="svmfeatures",
                                     excluded=["sentiment"], zero_based=False)
    model_step = xg.XGBoostClassifier(target="sentiment",
                                      svmlight_feature="svmfeatures",
                                      silent=1, max_depth=5, n_estimators=200,
                                      objective='binary:logistic',
                                      scale_pos_weight=0.2)
    cv = cf.Seldon_KFold(model_step, metric='auc', save_folds_folder="./folds")
    pipeline = Pipeline([("tTfidf", tfidf_step),
                         ("tFilter2", filter_step),
                         ("svmTransform", svm_step),
                         ("cv", cv)])
    wrapper = sutl.Pipeline_wrapper()
    frame = wrapper.create_dataframe_from_files([self.data_folder],
                                                df_format="csv")
    if sample < 1.0:
        logger.info("sampling dataset to size %s ", sample)
        frame = frame.sample(frac=sample, random_state=1)
    logger.info("Data frame shape %d , %d", frame.shape[0], frame.shape[1])
    fitted = pipeline.fit_transform(frame)
    wrapper.save_pipeline(pipeline, self.model_folder)
    logger.info("cross validation scores %s", cv.get_scores())
    return pipeline
def test_kfold(self): x = xgb.XGBoostClassifier(target="target", learning_rate=0.1, silent=0, objective='binary:logistic') t = cf.Seldon_KFold(x, 3) f1 = {"target": 0, "b": 1.0, "c": 0} f2 = {"target": 1, "b": 0, "c": 2.0} fs = [] for i in range(1, 50): fs.append(f1) fs.append(f2) print "features=>", fs df = pd.DataFrame.from_dict(fs) t.fit(df)
def test_create_features(self): t = xgb.XGBoostClassifier(target="target",learning_rate=0.1,silent=0,objective='binary:logistic') f1 = {"target":0,"b":1.0,"c":0} f2 = {"target":1,"b":0,"c":2.0} fs = [] for i in range (1,50): fs.append(f1) fs.append(f2) print "features=>",fs df = pd.DataFrame.from_dict(fs) t.fit(df) scores = t.predict_proba(df) print scores.shape print "scores->",scores[0] preds = t.predict(df) print "predictions->",preds[0],preds[1] self.assertEquals(preds[0],0) self.assertEquals(preds[1],1)
def test_sklearn_pipeline(self): t = xgb.XGBoostClassifier(target="target",learning_rate=0.1,silent=0,objective='binary:logistic') f1 = {"target":0,"b":1.0,"c":0} f2 = {"target":1,"b":0,"c":2.0} fs = [] for i in range (1,50): fs.append(f1) fs.append(f2) print "features=>",fs df = pd.DataFrame.from_dict(fs) estimators = [("xgb",t)] p = Pipeline(estimators) p.fit(df) preds = p.predict_proba(df) print preds print "-------------------" joblib.dump(p,"/tmp/pipeline/p") p2 = joblib.load("/tmp/pipeline/p") df3 = p2.predict_proba(df) print df3
def test_kfold(self): x = xgb.XGBoostClassifier(target="target", learning_rate=0.1, silent=0, objective='binary:logistic') t = bopt.BayesOptimizer(x, { 'learning_rate': (0.01, 0.3), 'n_estimators': (10, 1000) }, param_int=['n_estimators']) f1 = {"target": 0, "b": 1.0, "c": 0} f2 = {"target": 1, "b": 0, "c": 2.0} fs = [] for i in range(1, 50): fs.append(f1) fs.append(f2) print "features=>", fs df = pd.DataFrame.from_dict(fs) t.fit(df) print t.get_params() print t.get_best_score()
def run_pipeline(events, models):
    """Fit a name-prediction pipeline from event data and persist it.

    NOTE(review): a sibling run_pipeline uses create_dataframe_from_files;
    this one calls create_dataframe — confirm which loader is intended here.
    """
    id_step = bt.Feature_id_transform(min_size=0, exclude_missing=True,
                                      zero_based=True, input_feature="name",
                                      output_feature="nameId")
    auto_step = pauto.Auto_transform(max_values_numeric_categorical=2,
                                     exclude=["nameId", "name"])
    model_step = xg.XGBoostClassifier(target="nameId", target_readable="name",
                                      excluded=["name"], learning_rate=0.1,
                                      silent=0)
    pipeline = Pipeline([("tName", id_step), ("tAuto", auto_step),
                         ("xgb", model_step)])
    wrapper = sutl.Pipeline_wrapper()
    frame = wrapper.create_dataframe(events)
    pipeline.fit(frame)
    wrapper.save_pipeline(pipeline, models)