Ejemplo n.º 1
0
    def train(self, sample):
        """Fit the review-sentiment pipeline, save it, and return it.

        The pipeline has four named steps, in order: a TF-IDF transform of
        the ``review`` column, a filter keeping only ``tfidf`` and
        ``sentiment``, an svmlight conversion, and a ``Seldon_KFold``
        wrapper around an XGBoost classifier scored with ``auc``.

        Training data is read as CSV from ``self.data_folder``; the fitted
        pipeline is written to ``self.model_folder``.

        Args:
            sample: Fraction of the loaded rows to train on. Values below
                1.0 trigger sampling with a fixed ``random_state``; any
                other value uses every row.

        Returns:
            The pipeline object after ``fit_transform`` has been run on it.
        """
        tfidf_options = dict(
            input_feature="review",
            output_feature="tfidf",
            target_feature="sentiment",
            min_df=10,
            max_df=0.7,
            select_features=False,
            topn_features=50000,
            stop_words="english",
            ngram_range=[1, 2],
        )
        tfidf_step = ptfidf.Tfidf_transform(**tfidf_options)

        keep_step = bt.Include_features_transform(
            included=["tfidf", "sentiment"])

        # The target column is excluded from the svmlight feature vector.
        svmlight_step = bt.Svmlight_transform(
            output_feature="svmfeatures",
            excluded=["sentiment"],
            zero_based=False)

        booster_options = dict(
            target="sentiment",
            svmlight_feature="svmfeatures",
            silent=1,
            max_depth=5,
            n_estimators=200,
            objective='binary:logistic',
            scale_pos_weight=0.2,
        )
        booster = xg.XGBoostClassifier(**booster_options)

        kfold_step = cf.Seldon_KFold(
            booster, metric='auc', save_folds_folder="./folds")

        pipeline = Pipeline([
            ("tTfidf", tfidf_step),
            ("tFilter2", keep_step),
            ("svmTransform", svmlight_step),
            ("cv", kfold_step),
        ])

        wrapper = sutl.Pipeline_wrapper()
        frame = wrapper.create_dataframe_from_files(
            [self.data_folder], df_format="csv")

        use_subset = sample < 1.0
        if use_subset:
            logger.info("sampling dataset to size %s ", sample)
            # Fixed random_state keeps the sampled subset reproducible.
            frame = frame.sample(frac=sample, random_state=1)

        shape = frame.shape
        logger.info("Data frame shape %d , %d", shape[0], shape[1])

        # Only the fitting side effect is needed; the transformed frame
        # returned by fit_transform is not used afterwards.
        pipeline.fit_transform(frame)
        wrapper.save_pipeline(pipeline, self.model_folder)
        logger.info("cross validation scores %s", kfold_step.get_scores())

        return pipeline
 def test_chi_sq(self):
     """Check one TF-IDF weight when feature selection is switched on.

     Builds a two-row frame whose ``likeids`` column holds lists of integer
     ids and whose ``target`` is 1 for both rows, fits a ``Tfidf_transform``
     with ``select_features=True`` and ``topn_features=2``, and asserts the
     weight stored under the key ``u"943587665656008"`` for the first row.

     NOTE(review): the test name suggests the selection is chi-squared
     based; that is not visible from this block -- confirm against
     ``Tfidf_transform``.
     """
     df = pd.DataFrame([{
         "likeids": [
             102313626475894, 110544635673389, 125327794232468,
             1381292875499997, 1405744859651437, 1410444839224829,
             143981935773732, 1461778597368720, 159634840722133,
             1606136206319782, 1617969055108441, 175338329150218,
             208633805894647, 244944385603396, 245732105485076,
             264050273731539, 302410296457933, 316001081653,
             379092115616379, 388308794577146, 430801740267217,
             446108528831354, 447497665393027, 450086458346414,
             470402606305707, 488128054598370, 508960455844413,
             518468471599787, 56531631380, 67920382572, 7310480740,
             863556617025433, 943587665656008
         ],
         "target":
         1
     }, {
         "likeids": [
             100318233349756, 111541275541080, 118478001539872,
             1392733037644503, 14202933641, 1450755685155363,
             145930618882430, 1459747570939968, 1482351465350198,
             149829238430713, 1519856064948500, 1562464497303561,
             173955149455632, 198812000402, 234877673256809, 258941513699,
             261039940586093, 269993129865393, 277030342456725,
             307688779325497, 322739367839522, 353924261385391,
             376344085771485, 396795520381965, 405199642913939,
             407889119354319, 461654403900239, 484970471577531, 49852438689,
             511808655562689, 525243884184515, 527731407332180,
             533522990127878, 544053285683692, 554468308013797,
             605579839502285, 606676766022011, 663419033716761, 6665038402,
             684466768336969, 747130848662418, 771257532932834,
             782938198415086, 876613622381314
         ],
         "target":
         1
     }])
     t = tf.Tfidf_transform(input_feature="likeids",
                            output_feature="tfidf",
                            select_features=True,
                            topn_features=2,
                            target_feature="target")
     t.fit(df)
     df2 = t.transform(df)
     # Parenthesised so the line parses on Python 3 as well; with a single
     # argument the printed output is unchanged on Python 2.
     print(df2)
     # The ids are integers in the input, but the asserted key is the
     # unicode string form of the last id in the first row.
     self.assertAlmostEqual(df2["tfidf"][0][u"943587665656008"],
                            0.174077655956)