Example #1
 def test_numpy_input(self):
     t = xgb.XGBoostClassifier(n_estimators=10, learning_rate=0.1, silent=0)
     X = np.random.randn(6, 4)
     y = np.array([0, 1, 1, 0, 0, 1])
     t.fit(X, y)
     scores = t.predict_proba(X)
     print(scores)
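
For reference, a minimal sketch of the same numpy-in, probabilities-out flow using the stock xgboost scikit-learn wrapper (xgboost.XGBClassifier) instead of Seldon's XGBoostClassifier; nothing below is part of Seldon's API.

import numpy as np
import xgboost

# Toy binary problem mirroring the test above.
clf = xgboost.XGBClassifier(n_estimators=10, learning_rate=0.1)
X = np.random.randn(6, 4)
y = np.array([0, 1, 1, 0, 0, 1])
clf.fit(X, y)
print(clf.predict_proba(X))  # one probability row per sample
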
Example #2
def run_pipeline(events, models):

    tNameId = bt.Feature_id_transform(min_size=0,
                                      exclude_missing=True,
                                      zero_based=True,
                                      input_feature="name",
                                      output_feature="nameId")
    tAuto = pauto.Auto_transform(max_values_numeric_categorical=2,
                                 exclude=["nameId", "name"])
    xgb = xg.XGBoostClassifier(target="nameId",
                               target_readable="name",
                               excluded=["name"],
                               learning_rate=0.1,
                               silent=1)
    cv = cf.Seldon_KFold(xgb, 5)
    logger.info("cross validation scores %s", cv.get_scores())

    transformers = [("tName", tNameId), ("tAuto", tAuto), ("cv", cv)]
    p = Pipeline(transformers)

    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe_from_files(events)
    df2 = p.fit_transform(df)
    pw.save_pipeline(p, models)
    logger.info("cross validation scores %s", cv.get_scores())
Example #3
 def test_set_params(self):
     t = xgb.XGBoostClassifier(target="target",
                               learning_rate=0.1,
                               silent=0,
                               objective='binary:logistic')
     t.set_params(learning_rate=0.9, gamma=0.1)
     self.assertEqual(t.get_params()['learning_rate'], 0.9)
Example #4
 def test_svmlight_features(self):
     t = xgb.XGBoostClassifier(target="target",
                               svmlight_feature="svm",
                               learning_rate=0.1,
                               silent=0,
                               objective='binary:logistic')
     df = pd.DataFrame([{"svm": [(1, 2.0), (2, 3.0)], "target": 1}])
     t.fit(df)
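
The svmlight_feature column above holds (index, value) pairs rather than one column per feature. As an illustration of that layout only (assuming 1-based indices, matching the pairs in this test), the pairs could be expanded into an ordinary matrix as below; this helper is not Seldon's code.

from scipy.sparse import dok_matrix

pairs = [(1, 2.0), (2, 3.0)]      # (feature index, value) as in the test row
m = dok_matrix((1, 3))            # one row, three feature columns
for idx, val in pairs:
    m[0, idx - 1] = val           # shift 1-based indices to 0-based columns
print(m.toarray())                # [[2. 3. 0.]]
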
Example #5
    def train(self, sample):

        tTfidf = ptfidf.Tfidf_transform(input_feature="review",
                                        output_feature="tfidf",
                                        target_feature="sentiment",
                                        min_df=10,
                                        max_df=0.7,
                                        select_features=False,
                                        topn_features=50000,
                                        stop_words="english",
                                        ngram_range=[1, 2])

        tFilter2 = bt.Include_features_transform(
            included=["tfidf", "sentiment"])

        svmTransform = bt.Svmlight_transform(output_feature="svmfeatures",
                                             excluded=["sentiment"],
                                             zero_based=False)

        classifier_xg = xg.XGBoostClassifier(target="sentiment",
                                             svmlight_feature="svmfeatures",
                                             silent=1,
                                             max_depth=5,
                                             n_estimators=200,
                                             objective='binary:logistic',
                                             scale_pos_weight=0.2)

        cv = cf.Seldon_KFold(classifier_xg,
                             metric='auc',
                             save_folds_folder="./folds")

        transformers = [("tTfidf", tTfidf), ("tFilter2", tFilter2),
                        ("svmTransform", svmTransform), ("cv", cv)]

        p = Pipeline(transformers)

        pw = sutl.Pipeline_wrapper()
        df = pw.create_dataframe_from_files([self.data_folder],
                                            df_format="csv")
        if sample < 1.0:
            logger.info("sampling dataset to size %s ", sample)
            df = df.sample(frac=sample, random_state=1)

        logger.info("Data frame shape %d , %d", df.shape[0], df.shape[1])

        df2 = p.fit_transform(df)
        pw.save_pipeline(p, self.model_folder)
        logger.info("cross validation scores %s", cv.get_scores())

        return p
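
As a point of comparison, the review -> tf-idf -> classifier chain above can be sketched with scikit-learn's TfidfVectorizer and the stock xgboost wrapper; the toy reviews and labels are made up for illustration and none of this is Seldon's API.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

reviews = ["great film, loved it", "terrible plot, wasted evening",
           "loved the acting", "wasted two hours"]
sentiment = [1, 0, 1, 0]

p = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", ngram_range=(1, 2))),
    ("clf", XGBClassifier(max_depth=5,
                          n_estimators=200,
                          objective="binary:logistic",
                          scale_pos_weight=0.2)),
])
p.fit(reviews, sentiment)
print(p.predict_proba(["loved the film"]))
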
Example #6
 def test_kfold(self):
     x = xgb.XGBoostClassifier(target="target",
                               learning_rate=0.1,
                               silent=0,
                               objective='binary:logistic')
     t = cf.Seldon_KFold(x, 3)
     f1 = {"target": 0, "b": 1.0, "c": 0}
     f2 = {"target": 1, "b": 0, "c": 2.0}
     fs = []
     for i in range(1, 50):
         fs.append(f1)
         fs.append(f2)
     print "features=>", fs
     df = pd.DataFrame.from_dict(fs)
     t.fit(df)
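
For orientation, the same k-fold idea can be expressed with scikit-learn's cross_val_score and the stock xgboost wrapper; this is only a rough counterpart to Seldon_KFold, not its implementation.

import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Same two alternating toy rows as the test above (98 rows in total).
df = pd.DataFrame([{"target": 0, "b": 1.0, "c": 0},
                   {"target": 1, "b": 0, "c": 2.0}] * 49)
X, y = df[["b", "c"]], df["target"]
print(cross_val_score(XGBClassifier(learning_rate=0.1), X, y, cv=3))
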
Example #7
 def test_create_features(self):
     t = xgb.XGBoostClassifier(target="target",learning_rate=0.1,silent=0,objective='binary:logistic')
     f1 = {"target":0,"b":1.0,"c":0}
     f2 = {"target":1,"b":0,"c":2.0}
     fs = []
     for i in range (1,50):
         fs.append(f1)
         fs.append(f2)
     print "features=>",fs
     df = pd.DataFrame.from_dict(fs)
     t.fit(df)
     scores = t.predict_proba(df)
     print scores.shape
     print "scores->",scores[0]
     preds = t.predict(df)
     print "predictions->",preds[0],preds[1]
     self.assertEquals(preds[0],0)
     self.assertEquals(preds[1],1)
Example #8
 def test_sklearn_pipeline(self):
     t = xgb.XGBoostClassifier(target="target",
                               learning_rate=0.1,
                               silent=0,
                               objective='binary:logistic')
     f1 = {"target": 0, "b": 1.0, "c": 0}
     f2 = {"target": 1, "b": 0, "c": 2.0}
     fs = []
     for i in range(1, 50):
         fs.append(f1)
         fs.append(f2)
     print("features=>", fs)
     df = pd.DataFrame.from_dict(fs)
     estimators = [("xgb", t)]
     p = Pipeline(estimators)
     p.fit(df)
     preds = p.predict_proba(df)
     print(preds)
     print("-------------------")
     joblib.dump(p, "/tmp/pipeline/p")
     p2 = joblib.load("/tmp/pipeline/p")
     df3 = p2.predict_proba(df)
     print(df3)
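
The dump/reload round-trip in the test above is plain joblib persistence of a fitted Pipeline. A self-contained sketch of the same pattern with ordinary scikit-learn objects (paths and data are illustrative only):

import os
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = (X[:, 0] > 0).astype(int)

p = Pipeline([("clf", LogisticRegression())])
p.fit(X, y)

os.makedirs("/tmp/pipeline", exist_ok=True)   # dump() needs the directory to exist
joblib.dump(p, "/tmp/pipeline/p")
p2 = joblib.load("/tmp/pipeline/p")
print(p2.predict_proba(X)[:2])                # reloaded pipeline predicts as before
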
Example #9
 def test_kfold(self):
     x = xgb.XGBoostClassifier(target="target",
                               learning_rate=0.1,
                               silent=0,
                               objective='binary:logistic')
     t = bopt.BayesOptimizer(x,
                             {'learning_rate': (0.01, 0.3),
                              'n_estimators': (10, 1000)},
                             param_int=['n_estimators'])
     f1 = {"target": 0, "b": 1.0, "c": 0}
     f2 = {"target": 1, "b": 0, "c": 2.0}
     fs = []
     for i in range(1, 50):
         fs.append(f1)
         fs.append(f2)
     print "features=>", fs
     df = pd.DataFrame.from_dict(fs)
     t.fit(df)
     print(t.get_params())
     print(t.get_best_score())
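
As a comparison point only, the same two hyperparameters can be searched with scikit-learn's RandomizedSearchCV and the stock xgboost wrapper; this is not Seldon's BayesOptimizer API, and the toy data is made up.

import numpy as np
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = rng.integers(0, 2, size=100)

search = RandomizedSearchCV(
    XGBClassifier(objective="binary:logistic"),
    {"learning_rate": uniform(0.01, 0.29),   # samples from [0.01, 0.30]
     "n_estimators": randint(10, 1000)},     # integer draws, like param_int above
    n_iter=5,
    cv=3,
    random_state=1,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
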
Example #10
def run_pipeline(events, models):

    tNameId = bt.Feature_id_transform(min_size=0,
                                      exclude_missing=True,
                                      zero_based=True,
                                      input_feature="name",
                                      output_feature="nameId")
    tAuto = pauto.Auto_transform(max_values_numeric_categorical=2,
                                 exclude=["nameId", "name"])
    xgb = xg.XGBoostClassifier(target="nameId",
                               target_readable="name",
                               excluded=["name"],
                               learning_rate=0.1,
                               silent=0)

    transformers = [("tName", tNameId), ("tAuto", tAuto), ("xgb", xgb)]
    p = Pipeline(transformers)

    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe(events)
    df2 = p.fit(df)
    pw.save_pipeline(p, models)