def string_index(spark_df):
    """Index and one-hot encode every column of ``spark_df``, one at a time.

    For each column the function:

    1. fits a ``StringIndexer`` on that column alone, producing
       ``<column>_indexed``;
    2. prints the column name and the schema of the indexed column;
    3. one-hot encodes the index into ``<column>_encoded`` and displays the
       encoded frame with ``show()``;
    4. assembles the encoded column into a ``features`` vector column.

    Every intermediate DataFrame is local to the loop; the function only
    prints/displays and returns nothing. A column whose encoding fails is
    reported and skipped so the remaining columns are still processed.

    Args:
        spark_df: Object exposing ``columns`` and ``select`` (presumably a
            Spark DataFrame). Every column is treated as categorical.

    Returns:
        None.
    """
    for column in spark_df.columns:
        inp_col = str(column)
        out_col = inp_col + "_indexed"
        out_col_ohe = inp_col + "_encoded"

        # Fit the indexer on the single column only.
        fit_on = spark_df.select(inp_col)
        indexer = StringIndexer(inputCol=inp_col, outputCol=out_col)
        df_i_indexed = indexer.fit(fit_on).transform(fit_on)

        print(column)
        df_i_indexed.select(out_col).printSchema()

        try:
            encoder = OneHotEncoder(inputCol=out_col, outputCol=out_col_ohe)
            # Spark >= 3.0 made OneHotEncoder an Estimator that must be
            # fitted first; on Spark 2.x it is a Transformer with no fit().
            encoder_model = (
                encoder.fit(df_i_indexed) if hasattr(encoder, "fit") else encoder
            )
            df_i_encoded = encoder_model.transform(df_i_indexed)
            # show() returns None, so it must not be chained into the
            # assignment above.
            df_i_encoded.show()

            # inputCols takes a list of column names, and the encoded column
            # exists only on df_i_encoded (not on the original spark_df).
            vec_assembler = VectorAssembler(
                inputCols=[out_col_ohe], outputCol="features"
            )
            # NOTE(review): the assembled frame is discarded because this
            # function returns None; return or collect it if callers need it.
            vec_assembler.transform(df_i_encoded)
        except Exception as exc:
            # Best-effort per column: report the failure instead of silently
            # swallowing it, then continue with the next column.
            print("Could not encode column {}: {}".format(inp_col, exc))

    return None
workClassEncoder = OneHotEncoder(inputCol="WorkClass_index", outputCol="WorkClass_encoded")
# Spark >= 3.0 made OneHotEncoder an Estimator that must be fitted first;
# on Spark 2.x it is a Transformer with no fit().
workClassEncoderModel = (
    workClassEncoder.fit(indexedDF) if hasattr(workClassEncoder, "fit") else workClassEncoder
)
encodedDF = workClassEncoderModel.transform(indexedDF)


# #### A WorkClass_encoded field is created
# * This contains the one-hot encoding for WorkClass
# * The encoder cannot operate directly on a column with string values - values need to be numeric. Hence we use WorkClass_index as input

# In[16]:


# Limit on the Spark side before converting, so only the previewed rows
# (not the whole dataset) are collected to the driver.
encodedDF.limit(5).toPandas()


# #### View the original and transformed fields together

# In[17]:


encodedDF.select('WorkClass', 'WorkClass_index', 'WorkClass_encoded').limit(5).toPandas()


# ### Transform the entire dataset
# * So far we have only transformed a single column
# * We need to perform this transformation for every categorical and non-numeric column
# * This will be simplified by using a Pipeline (a feature of Spark ML)

# #### First, split the data into training and test sets

# In[18]:


# NOTE: randomSplit also accepts an optional seed; pass one if the 80/20
# split must be reproducible between runs.
(trainingData, testData) = dataset.randomSplit([0.8, 0.2])


# #### Encode all the categorical fields in the dataset
# We begin by listing all the categorical fields
# -*- coding: utf-8 -*-
"""OneHotEncoder demo.

Builds a six-row DataFrame with an ``id`` and a string ``category`` column,
maps ``category`` to a numeric ``categoryIndex`` with ``StringIndexer``,
one-hot encodes that index into ``categoryvecs`` and prints the ``id`` and
``categoryvecs`` columns.
"""
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer

spark = SparkSession.builder.appName("onehotencoder").getOrCreate()

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"],
)

# OneHotEncoder needs a numeric index column, so index the strings first.
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryvecs")
# Spark >= 3.0 made OneHotEncoder an Estimator that must be fitted first;
# on Spark 2.x it is a Transformer with no fit().
encoderModel = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder

# Keep the encoded DataFrame under its own name instead of rebinding
# ``encoder`` to it.
encoded = encoderModel.transform(indexed)

# DataFrames are immutable: select() returns a new frame, so it has to be
# chained into show() for the projection to take effect.
encoded.select("id", "categoryvecs").show()