def encoding2(df, incol, outcol):
    """Return ``df`` with ``incol`` one-hot encoded into a new column ``outcol``.

    The encoder is fitted on ``df`` itself. Only the transformed DataFrame is
    returned; the fitted encoder model is discarded.
    """
    fitted = OneHotEncoderEstimator(inputCols=[incol], outputCols=[outcol]).fit(df)
    return fitted.transform(df)
def encoding(i, df, col):
    """Return ``df`` with ``col`` one-hot encoded into a column named ``"p" + str(i)``.

    The encoder is fitted on ``df`` itself. Only the transformed DataFrame is
    returned; the fitted encoder model is discarded.
    """
    target = "p{!s}".format(i)
    fitted = OneHotEncoderEstimator(inputCols=[col], outputCols=[target]).fit(df)
    return fitted.transform(df)
def one_hot_encoder_estimator(dataset, inputCols, outputCols=None):
    """Fit a one-hot encoder on ``dataset`` and return the encoded data and the model.

    Args:
        dataset: DataFrame used both to fit the encoder and to be transformed.
        inputCols: Names of the columns to encode. A single name passed as a
            string is treated as a one-element list.
        outputCols: Names of the encoded output columns, one per input column.
            When omitted, each input name with ``'_ohee'`` appended is used.

    Returns:
        A tuple ``(transformed_dataset, fitted_model)``.
    """
    try:
        from pyspark.ml.feature import OneHotEncoderEstimator
    except ImportError:
        # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder.
        from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

    if isinstance(inputCols, str):
        inputCols = [inputCols]

    if outputCols is None:
        # Build one default name per input column; appending a str to the
        # list itself would raise TypeError.
        outputCols = [name + '_ohee' for name in inputCols]
    elif isinstance(outputCols, str):
        outputCols = [outputCols]

    model = OneHotEncoderEstimator(inputCols=inputCols, outputCols=outputCols).fit(dataset)
    return model.transform(dataset), model
# Exercise_1
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

# --------------------------------------------------
# Exercise_2
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
# NOTE: the value is not stored or printed, so it is only visible when this
# line is evaluated in an interactive session.
RegressionEvaluator(labelCol='duration').evaluate(predictions)

# --------------------------------------------------
# Exercise_3
# Fit a string indexer on the car type and add the resulting index column
indexer = StringIndexer(inputCol='type', outputCol='type_idx').fit(cars)
cars = indexer.transform(cars)

# Derived features: weight divided by length, length squared and length cubed,
# each rounded to 2 decimals.
# NOTE(review): `round` is applied to column expressions, so it is presumably
# pyspark.sql.functions.round imported elsewhere in the file - confirm.
cars = cars.withColumn('density', round(cars.weight_kg / cars.length_meters, 2))
cars = cars.withColumn('density_area', round(cars.weight_kg / cars.length_meters**2, 2))
cars = cars.withColumn('density_volume', round(cars.weight_kg / cars.length_meters**3, 2))

# One-hot encode the type index
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']).fit(cars)
cars = onehot.transform(cars)

# pandas display settings
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
pd.set_option('display.max_colwidth', 199)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

# Combine the predictors into a single 'features' vector column
assembler = VectorAssembler(
    inputCols=[
        'weight_kg',
        'cyl',
        'type_dummy',
        'density',
        'density_area',
        'density_volume',
    ],
    outputCol='features',
)
cars = assembler.transform(cars)

# Keep only the 'consumption' column and the assembled features
kars = cars.select('consumption', 'features')
# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()

# One-hot encoding
from pyspark.ml.feature import OneHotEncoderEstimator

# The list-valued parameters are the plural inputCols / outputCols
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])

# Fit the encoder to the data
onehot = onehot.fit(cars)

# How many category levels?
print(onehot.categorySizes)

cars = onehot.transform(cars)
cars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse
# The vector classes live in the pyspark package (there is no top-level
# `spark` package to import from).
from pyspark.mllib.linalg import DenseVector, SparseVector

DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
SparseVector(8, [0, 5], [1, 7])

# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
# Fit the indexer so it identifies the categories in the data
indexer_model = indexer.fit(flights)

# Add the column of numeric index values produced by the fitted indexer
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flites = (
    StringIndexer(inputCol="org", outputCol='org_idx')
    .fit(flights_indexed)
    .transform(flights_indexed)
)

# One-hot encode the org index and apply the encoder to the flights data
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"]).fit(flites)
flites = onehot.transform(flites)

# pandas display settings
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 199)
pd.set_option('display.max_colwidth', 199)

# Create buckets at 3 hour intervals through the day: 0, 3, ..., 24
buckets = Bucketizer(
    splits=list(range(0, 25, 3)),
    inputCol="depart",
    outputCol="depart_bucket",
)

# Bucket the departure times
bucketed = buckets.transform(flites)
#bucketed.select("depart", "depart_bucket").show(5)

# Create a one-hot encoder for departure (not fitted here)
onehot = OneHotEncoderEstimator(inputCols=["depart_bucket"], outputCols=["depart_dummy"])
# Fit a string indexer on the car type and add the resulting index column
indexer = StringIndexer(inputCol='type', outputCol='type_idx').fit(cars)
cars = indexer.transform(cars)

# pandas display settings
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Check column data types
print('\n', cars.dtypes, '\n')

# Work on just the name, the type and its index; show a random sample of 12 rows
kars = cars.select('name', 'type', 'type_idx')
print(kars.toPandas().sample(12))

# One-hot encode the type index and list the distinct encodings
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']).fit(kars)
kars = onehot.transform(kars)
kars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# The same 8-element vector written densely and sparsely
print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

spark.stop()
outputCol='idxPclass').fit(dftrain) dftrain = sipclass.transform(dftrain) dftrain = dftrain.drop('Pclass') # In[16]: dftrain.show() # In[17]: from pyspark.ml.feature import OneHotEncoderEstimator ohe = OneHotEncoderEstimator(handleInvalid='keep', dropLast=True, inputCols=['idxPclass'], outputCols=['ohePclass']).fit(dftrain) dftrain = ohe.transform(dftrain) dftrain = dftrain.drop('idxPclass') dftrain.sample(withReplacement=False, fraction=0.1).limit(20).show() # In[18]: from pyspark.ml.feature import VectorAssembler va = VectorAssembler( inputCols=['SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass'], outputCol='features') dftrain = va.transform(dftrain) dftrain = dftrain.drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass') dftrain.show() # In[19]: