print(basetable_train.count())
print(basetable_test.count())

# COMMAND ----------

# DBTITLE 1,create the new vector variable Features
# Transform the base tables into the (label, features) layout produced by
# RFormula: `label` is the response and every other column except CustomerID
# is assembled into the `features` vector.
from pyspark.ml.feature import RFormula

# Single definition of the formula so all tables are built the same way.
feature_formula = "label ~ . - CustomerID"

# The full table is fitted and transformed on its own.
trainBig = RFormula(formula=feature_formula).fit(
    final_basetable).transform(final_basetable)

# Fit the formula ONCE on the training split and reuse that fitted model for
# the test split. Fitting a second RFormula on the test data would derive the
# categorical indexing / one-hot layout from the test data itself, so the
# positions (and possibly the length) of the feature vector could differ from
# the ones a model trained on `train` expects.
train_formula_model = RFormula(formula=feature_formula).fit(basetable_train)
train = train_formula_model.transform(basetable_train)
test = train_formula_model.transform(basetable_test)
# NOTE(review): with RFormula's default handleInvalid setting, a categorical
# value that appears in the test split but never in the training split will
# fail when the features are computed. If that can happen here, decide
# explicitly how to handle it (e.g. the handleInvalid parameter) - confirm.

print("trainBig nobs: " + str(trainBig.count()))
print("train nobs: " + str(train.count()))
print("test nobs: " + str(test.count()))

# COMMAND ----------

# DBTITLE 1,features selection by chisqSelection
# Disabled experiment: chi-squared feature selection keeping the top 15
# features of `train`.
# NOTE(review): the draft called selector.getNumTopFeatures(15); the getter
# takes no argument, so the call is corrected below in case this is re-enabled.
#from pyspark.ml.feature import ChiSqSelector
#from pyspark.ml.linalg import Vectors

#selector = ChiSqSelector(numTopFeatures=15, featuresCol="features",
#                         outputCol="selectedFeatures", labelCol="label")

#result = selector.fit(train).transform(train)

#print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
#display(result)