# Beispiel #1 (Example 1) — scraped snippet separator; original vote count: 0
def string_index(spark_df):
    """Index and one-hot encode every column of a Spark DataFrame.

    For each column, fits a ``StringIndexer`` on that single column,
    prints the column name and the indexed column's schema, then
    one-hot encodes the index and shows the encoded frame.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        Input frame; every column is treated as categorical.

    Returns
    -------
    None
        The function works by side effect (printing/showing) only; the
        intermediate indexed/encoded frames are not returned.
    """
    for col_name in spark_df.columns:
        inp_col = str(col_name)
        out_col = inp_col + "_indexed"
        # Fit the indexer on just this one column.
        fit_on = spark_df.select(inp_col)
        df_indexed = (StringIndexer(inputCol=inp_col, outputCol=out_col)
                      .fit(fit_on)
                      .transform(fit_on))
        print(col_name)
        df_indexed.select(out_col).printSchema()
        out_col_ohe = inp_col + "_encoded"
        try:
            # BUG FIX: the original assigned the result of ``.show()``
            # (which returns None) and then called ``.select`` on it,
            # raising AttributeError on every iteration — silently
            # swallowed by a bare ``except``.
            df_encoded = (OneHotEncoder(inputCol=out_col,
                                        outputCol=out_col_ohe)
                          .transform(df_indexed))
            df_encoded.show()
            df_encoded.select(out_col_ohe)
            # BUG FIX: ``inputCols`` expects a list of column names, not
            # a single string.
            vec_assembler = VectorAssembler(inputCols=[out_col_ohe],
                                            outputCol="features")
            vec_assembler.transform(spark_df)
        except Exception as exc:
            # Was ``except: pass`` — keep best-effort semantics, but at
            # least report which column failed and why.
            print("encoding failed for column %r: %s" % (col_name, exc))
    return None
# NOTE(review): ``indexedDF`` and ``dataset`` are defined in earlier
# notebook cells not shown in this chunk.
# NOTE(review): calling ``transform`` directly (no ``fit``) implies the
# pre-Spark-3.0 OneHotEncoder API, where it was a pure Transformer —
# confirm against the project's Spark version.
encodedDF = OneHotEncoder(inputCol="WorkClass_index",
                          outputCol="WorkClass_encoded").transform(indexedDF)

# #### A WorkClass_encoded field is created
# * This contains the one-hot-encoding for WorkClass
# * This cannot operate directly on a column with string values - values need to be numeric. Hence we use the WorkClass_index as input

# In[16]:

# Convert to pandas purely for display in the notebook.
encodedDF.toPandas().head()

# #### View the original and transformed fields together

# In[17]:

encodedDF.select('WorkClass', 'WorkClass_index',
                 'WorkClass_encoded').toPandas().head()

# ### Transform the entire dataset
# * So far we have only transformed a single column
# * We need to perform this transformation for every categorical and non-numeric column
# * This will be simplified by using a Pipeline (a feature of Spark ML)

# ####  First, split the data into training and test sets

# In[18]:

# 80/20 random split; no seed is passed, so the split is not reproducible
# across runs.
(trainingData, testData) = dataset.randomSplit([0.8, 0.2])

# #### Encode all the categorical fields in the dataset
# We begin by listing all the categorical fields
# Beispiel #3 (Example 3) — scraped snippet separator; original vote count: 0
# _*_ coding:utf-8 _*_
'''
OneHotEncoder
'''

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Minimal end-to-end demo: string column -> numeric index -> one-hot vector.
spark = SparkSession.builder.appName("onehotencoder").getOrCreate()

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["id", "category"])

# OneHotEncoder cannot consume strings directly, so map each category to a
# numeric index first.
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

model = stringIndexer.fit(df)

indexed = model.transform(df)

# NOTE(review): ``transform`` without ``fit`` implies the pre-Spark-3.0
# OneHotEncoder (pure Transformer) API — confirm the targeted Spark version.
# dropLast=False keeps all category slots instead of dropping the last one.
encoder = OneHotEncoder(dropLast=False,
                        inputCol="categoryIndex",
                        outputCol="categoryvecs")

# BUG FIX: the original rebound ``encoder`` to the transformed DataFrame
# (shadowing the Transformer) and then discarded the result of
# ``.select(...)`` — a no-op line. Keep them as distinct names and actually
# display the selected columns.
encoded = encoder.transform(indexed)

encoded.select("id", "categoryvecs").show()