# Example #1
# 0
# First use StringIndexer to convert categorical values to indices

# In[13]:

from pyspark.ml.feature import StringIndexer

# Map each distinct WorkClass string to a numeric index
# (most frequent label -> 0.0, next -> 1.0, ...).
work_class_indexer = StringIndexer(inputCol='WorkClass',
                                   outputCol='WorkClass_index')
wc_model = work_class_indexer.fit(dataset)
indexedDF = wc_model.transform(dataset)

# #### A new column called WorkClass_index is created
# This stores the indexed values of WorkClass

# In[14]:

indexedDF.toPandas().head()

# #### OneHotEncoding
# Use the new indexed field to obtain a one-hot-encoded field

# In[15]:

from pyspark.ml.feature import OneHotEncoder

# NOTE(review): calling transform() directly (no fit()) is the Spark 2.x
# OneHotEncoder API; in Spark >= 3.0 OneHotEncoder is an Estimator and
# requires .fit() first — confirm which Spark version this targets.
work_class_encoder = OneHotEncoder(inputCol="WorkClass_index",
                                   outputCol="WorkClass_encoded")
encodedDF = work_class_encoder.transform(indexedDF)

# #### A WorkClass_encoded field is created
# * This contains the one-hot-encoding for WorkClass
# * This cannot operate directly on a column with string values - values need to be numeric. Hence we use the WorkClass_index as input
# #### Define StringIndexers for categorical columns

# In[5]:

from pyspark.ml.feature import StringIndexer

# Index both categorical columns in turn. handleInvalid='keep' reserves an
# extra index for labels unseen at fit time instead of raising an error.
for raw_col, idx_col in (('Sex', 'Gender'), ('Embarked', 'Boarded')):
    dataset = StringIndexer(inputCol=raw_col,
                            outputCol=idx_col,
                            handleInvalid='keep').fit(dataset).transform(dataset)

dataset.toPandas().head()

# #### Drop the redundant columns

# In[6]:

# The original string columns are now represented by their indexed
# versions, so both can be removed in a single drop() call.
dataset = dataset.drop('Sex', 'Embarked')

dataset.toPandas().head()

# #### Define the required features to use in the VectorAssembler
# Since we are only examining data and not making predictions, we include all columns

# In[7]:
# Indexer creates a new column with numeric index values
# (indexer_model, flights, pd, Bucketizer and OneHotEncoderEstimator are
# defined earlier in the file — not visible in this section)
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature:
# fit a StringIndexer on "org" and append the numeric "org_idx" column
flites = StringIndexer(
    inputCol="org",
    outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
# Check first five records
#flights_indexed.show(5)

# Widen pandas display so the sampled rows print without column truncation
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)

print("Sample model input")
print(flites.toPandas().sample(12))

# Create buckets at 3 hour intervals through the day
# (8 buckets: [0,3), [3,6), ... [21,24] over the "depart" hour-of-day column)
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24],
                     inputCol="depart",
                     outputCol="depart_bucket")

# Bucket the departure times
bucketed = buckets.transform(flites)
bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder
# NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 class name; it
# was renamed to OneHotEncoder in Spark 3.0 — confirm the Spark version.
onehot = OneHotEncoderEstimator(inputCols=["depart_bucket"],
                                outputCols=["depart_dummy"])

# One-hot encode the bucketed departure times
# (the fit/transform call appears to be in the next section of the file)
# Convert nulls into 0's
complaints_final = complaints_final.na.fill(0)

# CUSTOMERS
# Check schema and first rows
customers.printSchema()  # Schema is ok
customers.toPandas().head(5)

# Find missings: one null-count aggregate per column
# (assumes `sum` and `col` are pyspark.sql.functions imported earlier — verify)
null_counts = [sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns]
customers.select(*null_counts)  # No missings

# Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")
# DELIVERY
# Check schema and first rows
delivery.printSchema()  # Schema is ok
delivery.toPandas().head(5)

# Find missings: one null-count aggregate per column
delivery.select(*[sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns])  # Found 780 missings in the DeliveryClass column

# Treating missing values: keep only rows with a known DeliveryClass
delivery = delivery.filter(col("DeliveryClass").isNotNull())

# Encoding string columns in "Delivery": index each categorical column,
# appending a "<name>_index" numeric column
for cat_col in ("DeliveryClass", "DeliveryTypeName"):
    delivery = StringIndexer(inputCol=cat_col,
                             outputCol=cat_col + "_index").fit(delivery).transform(delivery)

# Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")
#FORMULA
#Check schema and first rows