# First use StringIndexer to convert categorical values to indices

# In[13]:

from pyspark.ml.feature import StringIndexer

indexedDF = StringIndexer(
    inputCol='WorkClass',
    outputCol='WorkClass_index').fit(dataset).transform(dataset)

# #### A new column called WorkClass_index is created
# This stores the indexed values of WorkClass

# In[14]:

indexedDF.toPandas().head()

# #### OneHotEncoding
# Use the new indexed field to obtain a one-hot-encoded field

# In[15]:

from pyspark.ml.feature import OneHotEncoder

encodedDF = OneHotEncoder(
    inputCol="WorkClass_index",
    outputCol="WorkClass_encoded").transform(indexedDF)

# #### A WorkClass_encoded field is created
# * This contains the one-hot encoding of WorkClass
# * OneHotEncoder cannot operate directly on a column with string values - the values need to be numeric, hence we use WorkClass_index as the input
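# #### Optional: the same two steps chained in a Pipeline
# A minimal sketch (not part of the original notebook) showing how the
# StringIndexer and OneHotEncoder stages above could be combined with
# pyspark.ml.Pipeline, so the index column is produced automatically before
# the encoder runs; the pipeline_encodedDF name is just for illustration.
# In Spark 3.0+ OneHotEncoder is an estimator, so the Pipeline fits it as well.

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

workclass_stages = [
    StringIndexer(inputCol='WorkClass', outputCol='WorkClass_index'),
    OneHotEncoder(inputCol='WorkClass_index', outputCol='WorkClass_encoded'),
]
pipeline_encodedDF = Pipeline(stages=workclass_stages).fit(dataset).transform(dataset)
pipeline_encodedDF.toPandas().head()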
# #### Define StringIndexers for categorical columns

# In[5]:

from pyspark.ml.feature import StringIndexer

dataset = StringIndexer(
    inputCol='Sex',
    outputCol='Gender',
    handleInvalid='keep').fit(dataset).transform(dataset)

dataset = StringIndexer(
    inputCol='Embarked',
    outputCol='Boarded',
    handleInvalid='keep').fit(dataset).transform(dataset)

dataset.toPandas().head()

# #### Drop the redundant columns

# In[6]:

dataset = dataset.drop('Sex')
dataset = dataset.drop('Embarked')

dataset.toPandas().head()

# #### Define the required features to use in the VectorAssembler
# Since we are only examining data and not making predictions, we include all columns

# In[7]:
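# A minimal sketch (not the original In[7] cell; the column names are assumed)
# of the VectorAssembler step the heading above refers to: combining the
# remaining numeric columns plus the indexed Gender and Boarded columns into a
# single 'features' vector. Replace required_features with the actual columns.

from pyspark.ml.feature import VectorAssembler

required_features = ['Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded']  # assumed names
assembler = VectorAssembler(inputCols=required_features, outputCol='features')
assembled_df = assembler.transform(dataset)
assembled_df.toPandas().head()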
# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = StringIndexer(
    inputCol="org",
    outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)

# Check first five records
# flights_indexed.show(5)

pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.width', 161)
print("Sample model input")
print(flites.toPandas().sample(12))

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(
    splits=[0, 3, 6, 9, 12, 15, 18, 21, 24],
    inputCol="depart",
    outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flites)
bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder
onehot = OneHotEncoderEstimator(
    inputCols=["depart_bucket"],
    outputCols=["depart_dummy"])

# One-hot encode the bucketed departure times
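# A minimal sketch of that encoding step: fit the encoder defined above and
# apply it to the bucketed departure times (the flights_onehot name is just
# illustrative). OneHotEncoderEstimator is the Spark 2.3/2.4 class; in
# Spark 3.0+ the same estimator is simply pyspark.ml.feature.OneHotEncoder.
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select("depart", "depart_bucket", "depart_dummy").show(5)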
# Convert nulls into 0s
complaints_final = complaints_final.na.fill(0)

# CUSTOMERS
# Check schema and first rows
customers.printSchema()  # Schema is ok
customers.toPandas().head(5)

# Find missing values
customers.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns)).show()  # No missings

# Rename the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")

# DELIVERY
# Check schema and first rows
delivery.printSchema()  # Schema is ok
delivery.toPandas().head(5)

# Find missing values
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)).show()  # Found 780 missings in the DeliveryClass column

# Treat missing values by dropping rows with a null DeliveryClass
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encode string columns in "delivery"
delivery = StringIndexer(
    inputCol="DeliveryClass",
    outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(
    inputCol="DeliveryTypeName",
    outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

# Rename the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")

# FORMULA
# Check schema and first rows
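# The per-column null count above is repeated for each table; a small helper
# (hypothetical, not in the original script) keeps those checks consistent and
# displays the counts.
from pyspark.sql.functions import col, sum as sql_sum

def count_missing(df):
    """Return a one-row DataFrame with the number of nulls in each column."""
    return df.select(*(sql_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns))

count_missing(customers).show()
count_missing(delivery).show()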