# Index the PaymentStatus string column into a numeric PaymentStatus_index column
payment_status_indexer = StringIndexer(
    inputCol="PaymentStatus",
    outputCol="PaymentStatus_index",
)
table1 = payment_status_indexer.fit(table1).transform(table1)

# Derive time features from StartDate/EndDate: the span in days, the span in
# months, and the calendar year of StartDate
table1 = (
    table1
    .withColumn("DaysSubscription", datediff(col("EndDate"), col("StartDate")))
    .withColumn("MonthsSubscription", months_between(col("EndDate"), col("StartDate")))
    .withColumn("Year", year("StartDate"))
)

# COMMAND ----------

# Feature engineering
# One row per CustomerID with aggregates over all of that customer's rows.
# NOTE(review): count/avg/sum/min/max are expected to be the Spark SQL
# functions (they shadow the Python builtins) - confirm against the imports.

# min, max and sum (in that order) for each discount/price/credit column
discount_price_credit_stats = [
    stat(column_name)
    for column_name in ("ProductDiscount", "TotalDiscount", "TotalPrice", "TotalCredit")
    for stat in (min, max, sum)
]

subs_totals = table1.groupBy("CustomerID").agg(
    count("SubscriptionID"),
    avg("DaysSubscription"),
    avg("MonthsSubscription"),
    sum("NbrMeals_REG"),
    sum("NbrMeals_EXCEP"),
    min("NbrMealsPrice"),
    max("NbrMealsPrice"),
    avg("NbrMealsPrice"),
    *discount_price_credit_stats
)

# Each pivot below yields one row per CustomerID; the key column is renamed
# to a distinct name in every result (cIDProduct / cIDPayment / cIDYear).

# Sums per customer, pivoted by ProductName
product_pivot = table1.groupBy("CustomerID").pivot("ProductName")
product_sums = [
    sum(column_name)
    for column_name in (
        "NbrMeals_REG",
        "NbrMeals_EXCEP",
        "NbrMealsPrice",
        "ProductDiscount",
        "TotalDiscount",
        "TotalPrice",
        "TotalCredit",
    )
]
subs_products = product_pivot.agg(*product_sums).withColumnRenamed("CustomerID", "cIDProduct")

# Price and credit sums per customer, pivoted by PaymentType
payment_pivot = table1.groupBy("CustomerID").pivot("PaymentType")
subs_payment_type = payment_pivot.agg(
    sum("TotalPrice"),
    sum("TotalCredit"),
).withColumnRenamed("CustomerID", "cIDPayment")

# Subscription counts per customer, pivoted by the Year column
year_pivot = table1.groupBy("CustomerID").pivot("Year")
subs_year = year_pivot.agg(count("SubscriptionID")).withColumnRenamed("CustomerID", "cIDYear")
# Average minutes on ground at JFK: intercept plus the coefficient at index 3
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA: intercept plus the coefficient at index 4.
# Read the coefficient straight off the fitted model, exactly as for JFK above
# (the previous `regression.regression` lookup raised AttributeError).
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

# RPM buckets
from pyspark.ml.feature import Bucketizer
# Three buckets: [3500, 4500), [4500, 6000) and [6000, 6500].
# The keyword names must be `splits` and `outputCol`; misspelled keywords are
# rejected when the transformer is constructed.
bucketizer = Bucketizer(splits=[3500, 4500, 6000, 6500],
    inputCol='rpm', outputCol='rpm_bin')
# Apply buckets to the rpm column; the result replaces `cars`
cars = bucketizer.transform(cars)
# Show rpm next to its bucket index, then the number of rows per bucket.
# The bucketed frame is `cars` (no `bucketed` frame exists at this point).
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Density features: mass divided by length, length squared and length cubed.
# `**` binds tighter than `/`, so each divisor is the full power of length.
cars = (
    cars
    .withColumn('density_line', cars['mass'] / cars['length'])       # linear density
    .withColumn('density_quad', cars['mass'] / cars['length'] ** 2)  # area density
    .withColumn('density_cube', cars['mass'] / cars['length'] ** 3)  # volume density
)

from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Bucket edges every 3 hours across the day: 0, 3, 6, ..., 24
three_hour_edges = list(range(0, 25, 3))
buckets = Bucketizer(
    splits=three_hour_edges,
    inputCol='depart',
    outputCol='depart_bucket',
)

# Assign each departure time to its bucket and preview the first rows
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)
# Example #3
0
from pyspark.ml.feature import StringIndexer, OneHotEncoder
# Index the Embarked strings into the numeric column Embarked1
embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked1')
df3 = embarked_indexer.fit(df3).transform(df3)
df3.show()

# One-hot encode Embarked1 into Embarked2, keeping the last category
# (dropLast=False).
# NOTE(review): calling .transform() without .fit() only works where
# OneHotEncoder is a plain Transformer (Spark 2.x); on Spark 3.x it is an
# Estimator and needs .fit(df3) first - confirm the target Spark version.
embarked_encoder = OneHotEncoder(
    inputCol='Embarked1',
    outputCol='Embarked2',
    dropLast=False,
)
df3 = embarked_encoder.transform(df3)
df3.show()

# --------------------------------------------

# Index the Sex strings into the numeric column Gender
sex_indexer = StringIndexer(inputCol='Sex', outputCol='Gender')
df3 = sex_indexer.fit(df3).transform(df3)

# Per-Embarked row count and sum of the Embarked1 index.
# NOTE(review): Embarked is passed twice as a grouping key (once as a Column,
# once by name); kept as-is because it affects the printed output.
embarked_aggregations = {'Embarked': 'count', 'Embarked1': 'sum'}
df3.groupBy(df3['Embarked'], 'Embarked').agg(embarked_aggregations).show()
df3.show(5)

# Exploratory output: sample rows and the schema
df3.show(5)
df3.show(10)
df3.schema  # bare expression; only displayed by an interactive shell/notebook
df3.printSchema()
# --------------------------------------------

# Print the rows and the schema of df4
df4.show()
df4.printSchema()

# NOTE(review): `fit` and `si1` are not defined anywhere in the visible code -
# confirm where they come from.
fit(si1)
# Presumably the numeric codes for the two Sex values - verify against the
# fitted indexer's label order.
male, female = 0, 1