Beispiel #1
0
def run_pipeline(events, models):
    """Fit a random-forest pipeline on ``events`` and save it to ``models``.

    The pipeline has three steps: map the "name" feature to a "nameId"
    feature, apply ``Auto_transform`` (excluding "nameId" and "name"), and
    fit a ``RandomForestClassifier`` targeting "nameId" wrapped in
    ``Seldon_KFold``. The cross-validation scores are logged once the
    pipeline has been fitted and saved.

    Args:
        events: Input handed to ``Pipeline_wrapper.create_dataframe``.
        models: Destination handed to ``Pipeline_wrapper.save_pipeline``.
    """
    tNameId = bt.Feature_id_transform(min_size=0,
                                      exclude_missing=True,
                                      zero_based=True,
                                      input_feature="name",
                                      output_feature="nameId")
    tAuto = pauto.Auto_transform(max_values_numeric_categorical=2,
                                 exclude=["nameId", "name"])
    sk_classifier = RandomForestClassifier(verbose=1)
    classifier = ske.SKLearnClassifier(clf=sk_classifier,
                                       target="nameId",
                                       excluded=["name"])

    # Second argument is presumably the fold count -- confirm against
    # the Seldon_KFold signature.
    cv = cf.Seldon_KFold(classifier, 5)

    transformers = [("tName", tNameId), ("tAuto", tAuto), ("cv", cv)]
    p = Pipeline(transformers)

    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe(events)
    # The transformed frame is not needed here; the call is made to fit
    # every step, including the cross-validation wrapper.
    p.fit_transform(df)
    pw.save_pipeline(p, models)
    # Scores are only reported after fitting; before that there is nothing
    # meaningful to log.
    logger.info("cross validation scores %s", cv.get_scores())
Beispiel #2
0
def run_pipeline(events, models):
    """Fit an XGBoost pipeline on ``events`` and save it to ``models``.

    The pipeline has three steps: map the "name" feature to a "nameId"
    feature, apply ``Auto_transform`` (excluding "nameId" and "name"), and
    fit an ``XGBoostClassifier`` targeting "nameId" wrapped in
    ``Seldon_KFold``. The cross-validation scores are logged once the
    pipeline has been fitted and saved.

    Args:
        events: Input handed to
            ``Pipeline_wrapper.create_dataframe_from_files``.
        models: Destination handed to ``Pipeline_wrapper.save_pipeline``.
    """
    tNameId = bt.Feature_id_transform(min_size=0,
                                      exclude_missing=True,
                                      zero_based=True,
                                      input_feature="name",
                                      output_feature="nameId")
    tAuto = pauto.Auto_transform(max_values_numeric_categorical=2,
                                 exclude=["nameId", "name"])
    xgb = xg.XGBoostClassifier(target="nameId",
                               target_readable="name",
                               excluded=["name"],
                               learning_rate=0.1,
                               silent=1)
    # Second argument is presumably the fold count -- confirm against
    # the Seldon_KFold signature.
    cv = cf.Seldon_KFold(xgb, 5)

    transformers = [("tName", tNameId), ("tAuto", tAuto), ("cv", cv)]
    p = Pipeline(transformers)

    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe_from_files(events)
    # The transformed frame is not needed here; the call is made to fit
    # every step, including the cross-validation wrapper.
    p.fit_transform(df)
    pw.save_pipeline(p, models)
    # Scores are only reported after fitting; before that there is nothing
    # meaningful to log.
    logger.info("cross validation scores %s", cv.get_scores())
Beispiel #3
0
 def score(self, **params):
     """Cross-validate ``self.clf`` with ``params`` and return the score.

     Every parameter named in ``self.param_int`` is cast to ``int`` before
     the parameters are set on the classifier. The classifier is then
     wrapped in ``Seldon_KFold`` (with ``self.cv_folds``), fitted on
     ``self.X`` / ``self.y``, and the value of ``get_score()`` is returned.
     """
     int_casts = {name: int(params[name]) for name in self.param_int}
     params.update(int_casts)
     self.clf.set_params(**params)

     kfold = cf.Seldon_KFold(self.clf, self.cv_folds)
     kfold.fit(self.X, self.y)
     return kfold.get_score()
Beispiel #4
0
    def train(self, sample):
        """Build, fit and save the sentiment pipeline; return the pipeline.

        The pipeline chains a TF-IDF transform of the "review" feature, a
        filter keeping "tfidf" and "sentiment", an svmlight conversion, and
        an XGBoost classifier wrapped in ``Seldon_KFold`` (metric "auc").
        Data is read as CSV from ``self.data_folder`` and the fitted
        pipeline is saved to ``self.model_folder``.

        Args:
            sample: Fraction of the data to train on; values below 1.0
                trigger ``DataFrame.sample(frac=sample, random_state=1)``.

        Returns:
            The fitted ``Pipeline``.
        """
        tfidf_step = ptfidf.Tfidf_transform(
            input_feature="review",
            output_feature="tfidf",
            target_feature="sentiment",
            min_df=10,
            max_df=0.7,
            select_features=False,
            topn_features=50000,
            stop_words="english",
            ngram_range=[1, 2],
        )
        filter_step = bt.Include_features_transform(
            included=["tfidf", "sentiment"],
        )
        svmlight_step = bt.Svmlight_transform(
            output_feature="svmfeatures",
            excluded=["sentiment"],
            zero_based=False,
        )
        booster = xg.XGBoostClassifier(
            target="sentiment",
            svmlight_feature="svmfeatures",
            silent=1,
            max_depth=5,
            n_estimators=200,
            objective='binary:logistic',
            scale_pos_weight=0.2,
        )
        kfold = cf.Seldon_KFold(
            booster,
            metric='auc',
            save_folds_folder="./folds",
        )

        pipeline = Pipeline([
            ("tTfidf", tfidf_step),
            ("tFilter2", filter_step),
            ("svmTransform", svmlight_step),
            ("cv", kfold),
        ])

        wrapper = sutl.Pipeline_wrapper()
        frame = wrapper.create_dataframe_from_files(
            [self.data_folder], df_format="csv")

        # Optionally train on a reproducible random subset of the rows.
        if sample < 1.0:
            logger.info("sampling dataset to size %s ", sample)
            frame = frame.sample(frac=sample, random_state=1)

        n_rows, n_cols = frame.shape[0], frame.shape[1]
        logger.info("Data frame shape %d , %d", n_rows, n_cols)

        # The transformed output is not used; the call fits every step.
        pipeline.fit_transform(frame)
        wrapper.save_pipeline(pipeline, self.model_folder)
        logger.info("cross validation scores %s", kfold.get_scores())

        return pipeline