#Imports used throughout this section
from pyspark.sql.functions import (col, datediff, months_between, year,
                                   count, avg, sum, min, max)
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer

#Changing column types in 'Subscriptions' from string to integer
convert_int = ["NbrMeals_EXCEP", "GrossFormulaPrice", "NetFormulaPrice", "NbrMealsPrice", "ProductDiscount", "FormulaDiscount", "TotalDiscount", "TotalPrice", "TotalCredit"]

for i in convert_int:
    table1 = table1.withColumn(i, table1[i].cast("integer"))
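#Optional check (illustrative, not in the original): confirm the integer casts took effect
#table1.select(convert_int).printSchema()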

#Changing column types in 'Subscriptions' from string to timestamp
convert_date = ["StartDate","EndDate","RenewalDate","PaymentDate"]
    
for i in convert_date:
    table1 = table1.withColumn(i, table1[i].cast("timestamp"))
  
#Encoding string columns in merged table
table1 = StringIndexer(inputCol = "PaymentStatus", outputCol = "PaymentStatus_index").fit(table1).transform(table1)
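#Optional check (illustrative, not in the original): each distinct PaymentStatus maps to one numeric index
#table1.select("PaymentStatus", "PaymentStatus_index").distinct().show()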

#Creating meaningful Time variables
table1 = table1.withColumn("DaysSubscription", datediff(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("MonthsSubscription", months_between(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("Year", year("StartDate"))

# COMMAND ----------

#Feature engineering
#Aggregating variables by CustomerID
subs_totals = table1.groupBy("CustomerID").agg(count("SubscriptionID"), avg("DaysSubscription"), 
                                                  avg("MonthsSubscription"), sum("NbrMeals_REG"), sum("NbrMeals_EXCEP"), 
                                                  min("NbrMealsPrice"), max("NbrMealsPrice"), avg("NbrMealsPrice"), 
                                                  min("ProductDiscount"), max("ProductDiscount"), sum("ProductDiscount"), 
                                                  min("TotalDiscount"), max("TotalDiscount"), sum("TotalDiscount"),
                                                  min("TotalPrice"), max("TotalPrice"), sum("TotalPrice"), 
                                                  min("TotalCredit"), max("TotalCredit"),
                                                  sum("TotalCredit"))
# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)
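# A minimal sketch of the setup assumed above (names are hypothetical):
# a linear regression fitted on dummy-coded origin airports, where
# coefficients[4] is the LGA dummy and 'inter' holds the intercept
#from pyspark.ml.regression import LinearRegression
#regression = LinearRegression(labelCol='duration').fit(flights_train)
#inter = regression.intercept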

# RPM buckets
from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[3500, 4500, 6000, 6500],
                        inputCol='rpm', outputCol='rpm_bin')
# Apply the buckets to the rpm column
cars = bucketizer.transform(cars)
# Inspect the RPM buckets
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Engineering density
cars = cars.withColumn('density_line', cars.mass / cars.length)  # Linear density
cars = cars.withColumn('density_quad', cars.mass / cars.length ** 2)  # Area density
cars = cars.withColumn('density_cube', cars.mass / cars.length ** 3)  # Volume density
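# Optional check (illustrative, not in the original): compare the three density features
# cars.select('mass', 'length', 'density_line', 'density_quad', 'density_cube').show(5)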

from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[3 * x for x in range(9)], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoderEstimator(inputCols=['depart_bucket'], outputCols=['depart_dummy'])
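# Apply the encoder to the bucketed departure times
# (a minimal completion of the snippet above; 'flights_onehot' is an illustrative name)
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart', 'depart_bucket', 'depart_dummy').show(5)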
##String encoding of categorical variables
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

#df2 = df #backup in case of trouble

for cat_var in cat_x_vars:
    df = StringIndexer(inputCol=cat_var, outputCol=cat_var +
                       'Idx').fit(df).transform(df).drop(cat_var)
    df = df.withColumnRenamed(cat_var + 'Idx', cat_var)

#df.select(cat_x_vars).show(5) #check

##Create y or target variables for neural networks
#probability/indicator for default
df = df.withColumn('probDef',
                   F.when(df['loan_status'] == 1,
                          1.0).otherwise(0.0))  #default is 1, repaid is 0
#indicator for early repayment
df = df.withColumn(
    'probER',
    F.when((df['loan_status'] == 0) & (df['fracNumPmts'] < 1),
           1.0).otherwise(0.0))
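#Optional check (illustrative, not in the original): class balance of the two indicators
#df.groupBy('probDef', 'probER').count().show()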
#indicator for on-schedule repayment can be inferred as probDef = probER = 0

#visually:
#plot of timing of either default or eventual (not early) repayment
#df.filter((df['loan_status']==1)|(df.fracNumPmts >=1)).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()  #This is bi-modal: mostly low over (0,1) and then a spike at 1.

#plot of timing of repayment (whenever it occurs)
#df.filter(df['loan_status']==0).select(df.fracNumPmts).toPandas().plot.hist()