# Print the row count of basetable_train, then of basetable_test.
for _split_table in (basetable_train, basetable_test):
    print(_split_table.count())

# COMMAND ----------

# DBTITLE 1,create the new vector variable Features
# Transform the tables into (label, features) format
from pyspark.ml.feature import RFormula

# Formula used for every table: model `label` from all remaining columns
# except CustomerID.
rformula_spec = "label ~ . - CustomerID"

# trainBig: final_basetable encoded with an RFormula model fitted on itself.
trainBig = RFormula(formula=rformula_spec).fit(
    final_basetable).transform(final_basetable)

# Fit the RFormula model once, on the training split only, and reuse that same
# fitted model for the test split. Fitting a second model on basetable_test
# would let the test data define its own encoding, so the `features` (and an
# indexed `label`) could be laid out differently from the training data.
# NOTE(review): with RFormula's default handleInvalid setting, a category that
# occurs only in basetable_test will raise during transform; set handleInvalid
# explicitly if that case has to be tolerated.
train_rformula_model = RFormula(formula=rformula_spec).fit(basetable_train)
train = train_rformula_model.transform(basetable_train)
test = train_rformula_model.transform(basetable_test)

# Report the number of observations in each transformed table.
print("trainBig nobs: " + str(trainBig.count()))
print("train nobs: " + str(train.count()))
print("test nobs: " + str(test.count()))

# COMMAND ----------

# DBTITLE 1,features selection by chisqSelection
#from pyspark.ml.feature import ChiSqSelector
#from pyspark.ml.linalg import Vectors
#selector = ChiSqSelector(numTopFeatures=15, featuresCol="features",
#                        outputCol="selectedFeatures", labelCol="label")

#result = selector.fit(train).transform(train)

#print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
#display(result)