def train(self, sample):
    """Build, fit and save the review-sentiment pipeline.

    The pipeline chains four steps, in this order:

    1. ``tTfidf``: ``ptfidf.Tfidf_transform`` reading the ``"review"``
       column and writing ``"tfidf"``.
    2. ``tFilter2``: ``bt.Include_features_transform`` listing
       ``"tfidf"`` and ``"sentiment"`` as included features.
    3. ``svmTransform``: ``bt.Svmlight_transform`` writing
       ``"svmfeatures"`` with ``"sentiment"`` excluded.
    4. ``cv``: ``cf.Seldon_KFold`` wrapping an ``xg.XGBoostClassifier``
       with the ``'auc'`` metric and ``"./folds"`` as the folds folder.

    The data is loaded from ``self.data_folder`` in CSV format, and the
    fitted pipeline is saved to ``self.model_folder``.

    Args:
        sample: Fraction of the loaded data to train on. Values below
            ``1.0`` trigger ``df.sample(frac=sample, random_state=1)``;
            any other value uses the full data set.

    Returns:
        The fitted ``Pipeline`` instance.
    """
    tTfidf = ptfidf.Tfidf_transform(
        input_feature="review",
        output_feature="tfidf",
        target_feature="sentiment",
        min_df=10,
        max_df=0.7,
        select_features=False,
        topn_features=50000,
        stop_words="english",
        ngram_range=[1, 2],
    )
    tFilter2 = bt.Include_features_transform(
        included=["tfidf", "sentiment"],
    )
    svmTransform = bt.Svmlight_transform(
        output_feature="svmfeatures",
        excluded=["sentiment"],
        zero_based=False,
    )
    classifier_xg = xg.XGBoostClassifier(
        target="sentiment",
        svmlight_feature="svmfeatures",
        silent=1,
        max_depth=5,
        n_estimators=200,
        objective='binary:logistic',
        scale_pos_weight=0.2,
    )
    # The classifier is not a pipeline step itself; it is wrapped by the
    # K-fold object, which is the final step and later reports the scores.
    cv = cf.Seldon_KFold(
        classifier_xg,
        metric='auc',
        save_folds_folder="./folds",
    )

    transformers = [
        ("tTfidf", tTfidf),
        ("tFilter2", tFilter2),
        ("svmTransform", svmTransform),
        ("cv", cv),
    ]
    p = Pipeline(transformers)

    pw = sutl.Pipeline_wrapper()
    df = pw.create_dataframe_from_files([self.data_folder], df_format="csv")
    if sample < 1.0:
        logger.info("sampling dataset to size %s ", sample)
        # Fixed random_state keeps the sub-sample reproducible across runs.
        df = df.sample(frac=sample, random_state=1)

    logger.info("Data frame shape %d , %d", df.shape[0], df.shape[1])

    # Only the fitting side effect is needed; the transformed frame was
    # previously bound to an unused local and is intentionally discarded.
    p.fit_transform(df)
    pw.save_pipeline(p, self.model_folder)
    logger.info("cross validation scores %s", cv.get_scores())
    return p
def test_chi_sq(self):
    """Check a tf-idf weight when feature selection is switched on.

    Builds a two-row frame of ``likeids`` lists (both rows have
    ``target`` 1), fits ``tf.Tfidf_transform`` with
    ``select_features=True`` and ``topn_features=2``, transforms the same
    frame, and asserts the weight stored for id ``943587665656008`` in
    the first row.

    NOTE(review): the test name suggests the selection is chi-squared
    based; that is not visible from this test alone -- confirm against
    ``Tfidf_transform``.
    """
    df = pd.DataFrame([
        {
            "likeids": [
                102313626475894, 110544635673389, 125327794232468,
                1381292875499997, 1405744859651437, 1410444839224829,
                143981935773732, 1461778597368720, 159634840722133,
                1606136206319782, 1617969055108441, 175338329150218,
                208633805894647, 244944385603396, 245732105485076,
                264050273731539, 302410296457933, 316001081653,
                379092115616379, 388308794577146, 430801740267217,
                446108528831354, 447497665393027, 450086458346414,
                470402606305707, 488128054598370, 508960455844413,
                518468471599787, 56531631380, 67920382572,
                7310480740, 863556617025433, 943587665656008,
            ],
            "target": 1,
        },
        {
            "likeids": [
                100318233349756, 111541275541080, 118478001539872,
                1392733037644503, 14202933641, 1450755685155363,
                145930618882430, 1459747570939968, 1482351465350198,
                149829238430713, 1519856064948500, 1562464497303561,
                173955149455632, 198812000402, 234877673256809,
                258941513699, 261039940586093, 269993129865393,
                277030342456725, 307688779325497, 322739367839522,
                353924261385391, 376344085771485, 396795520381965,
                405199642913939, 407889119354319, 461654403900239,
                484970471577531, 49852438689, 511808655562689,
                525243884184515, 527731407332180, 533522990127878,
                544053285683692, 554468308013797, 605579839502285,
                606676766022011, 663419033716761, 6665038402,
                684466768336969, 747130848662418, 771257532932834,
                782938198415086, 876613622381314,
            ],
            "target": 1,
        },
    ])
    t = tf.Tfidf_transform(
        input_feature="likeids",
        output_feature="tfidf",
        select_features=True,
        topn_features=2,
        target_feature="target",
    )
    t.fit(df)
    df2 = t.transform(df)
    # Parenthesised form prints the same for a single argument on
    # Python 2 and is valid syntax on Python 3, unlike the bare statement.
    print(df2)
    # The transformed cell is indexed by the id rendered as a unicode string.
    self.assertAlmostEqual(df2["tfidf"][0][u"943587665656008"], 0.174077655956)