def piStrOneHotEncoding(featurename, dataframe):
    """String-index and one-hot encode column *featurename* of *dataframe*.

    The column is replaced in place: the original string column is dropped
    and re-created under the same name, holding the one-hot encoding as a
    dense array<int> of 0/1 values instead of a Spark sparse vector.

    :param featurename: name of the string column to encode
    :param dataframe: input pyspark.sql.DataFrame
    :return: DataFrame with *featurename* replaced by its dense one-hot array
    """
    from pyspark.ml.feature import OneHotEncoder
    from pyspark.ml.feature import StringIndexer
    from pyspark.ml.linalg import DenseVector

    # Map the string categories to numeric indices.
    indexer = StringIndexer(inputCol=featurename, outputCol=featurename + "HE")
    indexed = indexer.fit(dataframe).transform(dataframe)

    # One-hot encode the indexed column into a sparse vector column.
    encoder = OneHotEncoder(inputCols=[featurename + "HE"],
                            outputCols=[featurename + "OHE"])
    indexed = encoder.fit(indexed).transform(indexed)

    def convertSparseVectortoDenseVectorInt(v):
        # Densify the sparse one-hot vector and cast entries to plain ints
        # so the result serializes as array<int>.
        return [int(x) for x in DenseVector(v)]

    # F and T are assumed to be the module-level pyspark.sql.functions /
    # pyspark.sql.types aliases, as in the original code.
    toDenseVectorUdfInt = F.udf(convertSparseVectortoDenseVectorInt,
                                T.ArrayType(T.IntegerType()))

    # Replace the original column with the dense one-hot array and drop the
    # intermediate index/sparse columns.
    # (Removed dead code that assembled the OHE column with VectorAssembler
    # and collected the whole DataFrame via toPandas() — the result was never
    # used and pulling all rows to the driver is expensive.)
    indexed = indexed.drop(featurename).drop(featurename + "HE").withColumn(
        featurename,
        toDenseVectorUdfInt(featurename + "OHE")).drop(featurename + "OHE")
    return indexed
def add_pref_user_system(spark):
    """One-hot encode each user's preferred system (iPad, MacOS, Windows, ...).

    For every user the MAX of userSystem is taken as the preferred system,
    nulls are mapped to "unknown", the category is string-indexed by
    descending frequency, and finally one-hot encoded as dummy variables.

    Input:
    - spark (SparkSession): A initiated pyspark.sql.SparkSession
    """
    pref_user_system = """
    SELECT userId, MAX(userSystem) as pref_user_system
    FROM df_table
    GROUP BY userId
    """
    features_df = add_features(spark, pref_user_system, ['pref_user_system'])

    # Unknown operating systems become the explicit "unknown" category.
    features_df = features_df.fillna("unknown", subset=['pref_user_system'])

    # Index categories so the most frequent system gets index 0.
    indexer = StringIndexer(inputCol="pref_user_system",
                            outputCol="pref_user_system_ind",
                            stringOrderType="frequencyDesc")
    features_df = indexer.fit(features_df).transform(features_df)

    # Dummy-encode the index; dropLast avoids one redundant column.
    encoder = OneHotEncoder(inputCol="pref_user_system_ind",
                            outputCol="pref_user_system_ohe",
                            dropLast=True)
    features_df = encoder.fit(features_df).transform(features_df)

    features_df.createOrReplaceTempView("features_df")
    return features_df
def oneHotEncodeColumns(df, cols):
    """One-hot encode every column in *cols*, replacing each original
    column with its encoded vector under the same name."""
    result = df
    for column in cols:
        encoded_name = column + '-onehot'
        model = OneHotEncoder(inputCol=column,
                              outputCol=encoded_name,
                              dropLast=False).fit(result)
        result = (model.transform(result)
                       .drop(column)
                       .withColumnRenamed(encoded_name, column))
    return result
def one_hot_example(data: DataFrame):
    """Cast movieId to an integer column and one-hot encode it, then print
    info about the resulting DataFrame."""
    with_ids = data.withColumn("movieIdNumber",
                               F.col("movieId").cast("Integer"))
    ohe_model = OneHotEncoder(inputCol="movieIdNumber",
                              outputCol="movieIdVector").fit(with_ids)
    print_info(ohe_model.transform(with_ids))
def add_origin(df, trained_model=None):
    """String-index and one-hot encode the ORIGIN column.

    Reuses *trained_model* (a fitted StringIndexer model) when given so the
    same index mapping is applied; otherwise fits a fresh indexer on *df*.
    Returns (indexer_model, encoded_dataframe).
    """
    from pyspark.ml.feature import OneHotEncoder, StringIndexer

    if not trained_model:
        trained_model = StringIndexer(inputCol='ORIGIN',
                                      outputCol='origin_index').fit(df)
    with_index = trained_model.transform(df)
    ohe = OneHotEncoder(inputCol='origin_index', outputCol='origin_onehot')
    encoded = ohe.fit(with_index).transform(with_index)
    return trained_model, encoded
def oneHotEncoderExample(movieSamples):
    """One-hot encode integer movie ids into movieIdVector and show a sample."""
    with_id_number = movieSamples.withColumn(
        "movieIdNumber", F.col("movieId").cast(IntegerType()))
    # dropLast=False keeps a vector slot for every id.
    ohe = OneHotEncoder(inputCols=["movieIdNumber"],
                        outputCols=["movieIdVector"],
                        dropLast=False)
    encoded = ohe.fit(with_id_number).transform(with_id_number)
    encoded.printSchema()
    encoded.show(10)
def transform_data(data):
    '''
    Transform dataset
    Input: (data) Spark DataFrame
    Output: Transformed pandas DataFrame, one column per assembled feature
    '''
    import pandas as pd  # needed to rebuild the exploded feature frame

    # Save features that are already numerical
    data_num = data.select('age', 'hypertension', 'heart_disease',
                           'avg_glucose_level')

    # Use StringIndexer to transform categories to numerical values
    inputs = ['gender', 'ever_married', 'work_type', 'Residence_type',
              'smoking_status']
    outputs = ['gender_i', 'ever_married_i', 'work_type_i',
               'Residence_type_i', 'smoking_status_i']
    indexer = StringIndexer(inputCols=inputs, outputCols=outputs)
    indexed = indexer.fit(data).transform(data)
    indexed = indexed.select(*outputs)

    # Use OneHotEncoder to map the numerical values to vectors
    encoder = OneHotEncoder(inputCols=indexed.columns, outputCols=inputs)
    encoded = encoder.fit(indexed).transform(indexed)
    encoded = encoded.select(*inputs)

    # Combine numerical features into a single DataFrame.
    # NOTE(review): pairing rows via row_number over a constant-literal
    # window only works if both frames keep the same physical order — a real
    # join key from the source data would be safer; behavior kept as-is.
    w = Window.orderBy(lit(1))
    data_num = data_num.withColumn('rn', row_number().over(w) - 1)
    encoded = encoded.withColumn('rn', row_number().over(w) - 1)
    combined_data = data_num.join(encoded, ['rn']).drop('rn')

    # Combine features into a single feature column using VectorAssembler
    assembler = VectorAssembler(inputCols=combined_data.columns,
                                outputCol='features')
    assembled = assembler.transform(combined_data)

    # Convert sparse vectors to NumPy arrays
    assembled = assembled.toPandas()
    assembled['features'] = assembled['features'].apply(np.asarray)

    # Transform feature arrays to columns.
    # BUGFIX: the original referenced an undefined name ``df`` and called
    # ``assembled.DataFrame(...)`` (pandas DataFrames have no such method).
    # Build a fresh pandas DataFrame from the feature lists instead.
    new_columns = range(len(assembled['features'].iloc[0]))
    new_data = assembled['features'].to_list()
    assembled = pd.DataFrame(new_data, columns=new_columns)
    return assembled
def oneHotEncodeColumns(self, df, cols, save_path):
    '''
    One-hot encode the given columns, persisting one fitted encoder per column.

    :param df: input spark.DataFrame
    :param cols: names of the columns to encode
    :param save_path: directory under which each fitted encoder model is
                      saved (one sub-path "onehot-<col>" per column)
    :return: new spark.DataFrame with a "<col>-onehot" array<int> column
             added for every input column
    '''
    newdf = df
    total = len(cols)
    print("正在onehot特征化...")

    @udf(ArrayType(IntegerType()))
    def toDense(v):
        # Densify the sparse OHE vector into a plain list of ints.
        # (Per-row debug prints removed: they ran on the executors for
        # every record and polluted the worker logs.)
        return [int(x) for x in DenseVector(v)]

    for num, c in enumerate(cols, start=1):
        onehotEncoderPath = save_path + "/onehot-" + c
        print("{0}/{1} 正在onehot特征:{2}".format(num, total, c))
        # Index the string column; "keep" buckets unseen labels instead of failing.
        stringIndexer = StringIndexer(inputCol=c, outputCol=c + "Index",
                                      handleInvalid="keep")
        # BUGFIX: fit/transform the accumulated frame (newdf), not the
        # original df — previously each iteration discarded the columns
        # produced by the preceding iterations, so only the last column's
        # encoding survived.
        indexed = stringIndexer.fit(newdf).transform(newdf)
        ohe = OneHotEncoder(inputCol=c + "Index", outputCol=c + "-sparse",
                            dropLast=False)
        ohe_model = ohe.fit(indexed)
        newdf = ohe_model.transform(indexed)
        newdf = newdf.withColumn(c + "-onehot",
                                 toDense(c + "-sparse")).drop(c + "-sparse")
        # BUGFIX: save the fitted model (not the unfitted estimator) so the
        # exact category->slot mapping can be re-applied later.
        ohe_model.write().overwrite().save(onehotEncoderPath)

    print("完成onehot特征化!")
    # BUGFIX: actually return the encoded frame, as the docstring promises
    # (the return statement was commented out).
    return newdf
def oneHotEncoderExample(movieSamples):
    """One-hot encode movie ids, keeping every category slot."""
    # Add movieIdNumber: the movie id cast to an integer column.
    with_id_number = movieSamples.withColumn(
        "movieIdNumber", F.col("movieId").cast(IntegerType()))
    # dropLast=True would represent the last category as the all-zero
    # vector, which is not what we want here. With 1000 movies the vector
    # has 1001 slots because there is no movie with id 0.
    encoder = OneHotEncoder(inputCols=["movieIdNumber"],
                            outputCols=['movieIdVector'],
                            dropLast=False)
    # Run the transformation.
    encoded = encoder.fit(with_id_number).transform(with_id_number)
    # Print the schema, then a few rows without truncation.
    encoded.printSchema()
    encoded.show(10, False)
def test_model_onehot_encoder(self):
    """Round-trip a fitted Spark OneHotEncoder through ONNX conversion and
    compare the ONNX runtime output against Spark's own transform."""
    encoder = OneHotEncoder(inputCols=['index'], outputCols=['indexVec'])
    data = self.spark.createDataFrame(
        [(0.0,), (1.0,), (2.0,), (2.0,), (0.0,), (2.0,)], ['index'])
    model = encoder.fit(data)
    # Convert the fitted model; the input is one float column, any batch size.
    model_onnx = convert_sparkml(
        model, 'Sparkml OneHotEncoder', [('index', FloatTensorType([None, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.select("index").toPandas().values.astype(numpy.float32)
    predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
        lambda x: x.toArray().tolist()).values
    # Spark's encoder drops the last category slot by default; re-append it
    # so the expected vectors line up with the ONNX output.
    # NOTE(review): assumes the ONNX graph emits the full-length encoding — confirm.
    expected = numpy.asarray(
        [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneHotEncoder")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def one_hot_indexer_example():
    """Demo: one-hot encode two already-indexed categorical columns.

    See:
    https://spark.apache.org/docs/latest/mllib-data-types.html#local-vector
    https://stackoverflow.com/questions/42295001/how-to-interpret-results-of-spark-onehotencoder#42297197
    """
    rows = [
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (3.0, 1.0),
        (2.0, 0.0),
    ]
    df = spark.createDataFrame(rows, ["categoryIndex1", "categoryIndex2"])
    ohe = OneHotEncoder(
        inputCols=["categoryIndex1", "categoryIndex2"],
        outputCols=["categoryVec1", "categoryVec2"],
        dropLast=False)
    ohe.fit(df).transform(df).show()
# limitations under the License. # from __future__ import print_function # $example on$ from pyspark.ml.feature import OneHotEncoder # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("OneHotEncoderExample")\ .getOrCreate() # Note: categorical features are usually first encoded with StringIndexer # $example on$ df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)], ["categoryIndex1", "categoryIndex2"]) encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"], outputCols=["categoryVec1", "categoryVec2"]) model = encoder.fit(df) encoded = model.transform(df) encoded.show() # $example off$ spark.stop()
from pyspark.ml.feature import OneHotEncoder # COMMAND ---------- # MAGIC %md # MAGIC ##### Dealing with Cruise Line column: turn string to index and apply onehot encoding # COMMAND ---------- # StringIndex + Onehot for the Cruiseline Column: indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruiseline_Index') indexed = indexer.fit(dataset).transform(dataset) encoder = OneHotEncoder(inputCols=['Cruiseline_Index'], outputCols=['Cruiseline_Onehot']) encoded = encoder.fit(indexed).transform(indexed) encoded.show() # COMMAND ---------- # MAGIC %md # MAGIC ##### Construct the Feature column for the model by using VectorAssembler # COMMAND ---------- assembler = VectorAssembler(inputCols=[ 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'Cruiseline_Onehot' ], outputCol='features')
.getOrCreate()  # NOTE(review): continuation of a SparkSession builder chain started above this chunk

# TODO add timer
# TODO check resource usage

# Load training data
data = spark.read.options(header=True, inferSchema=True)\
    .csv("../UCI_repo/Abalone/abalone.csv")
data.cache()  # the frame is reused several times below
print(data.printSchema())
print(data.describe().toPandas().transpose())

# Turn the categorical 'Sex' column into a numeric index...
indexer = StringIndexer(inputCol='Sex', outputCol='Sex_cat')
indexed = indexer.fit(data).transform(data)

# ...and one-hot encode the index into a sparse vector.
onehot = OneHotEncoder(inputCol='Sex_cat', outputCol='Sex_onehot')
onehot_encoded = onehot.fit(indexed).transform(indexed)
for item in onehot_encoded.head(3):
    print(item)
    print('\n')

# Assemble the numeric measurements plus the encoded sex into one vector.
assembler = VectorAssembler(inputCols=['Length', 'Diameter', 'Height', 'Whole weight',\
                                       'Shucked weight', 'Viscera weight', 'Shell weight',\
                                       'Sex_onehot'], outputCol='features')
output = assembler.transform(onehot_encoded)
final_data = output.select('features', 'Rings')  # 'Rings' is the regression target

# 70/30 random train/test split.
train_data, test_data = final_data.randomSplit([0.7, 0.3])
test_data.describe().show(3)
# $example on$ from pyspark.ml.feature import OneHotEncoder # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("OneHotEncoderExample")\ .getOrCreate() # Note: categorical features are usually first encoded with StringIndexer # $example on$ df = spark.createDataFrame([ (0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0), (0.0, 1.0), (2.0, 0.0) ], ["categoryIndex1", "categoryIndex2"]) encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"], outputCols=["categoryVec1", "categoryVec2"]) model = encoder.fit(df) encoded = model.transform(df) encoded.show() # $example off$ spark.stop()
}).show() # group by and compute statistics df = df.withColumn( 'age-square', col('age')**2) # add a column named age-square that is the square of age # df = df.select([...]) # can easily change the order of columns df.filter(df['native-country'] == 'Holand-Netherlands').count() # count dutch df.groupby('native-country').agg({ 'native-country': 'count' }).sort(asc('count(native-country)')).show( ) # show statistics that shows the count of each native-country ordered by the count asc df_remove = df.filter( df['native-country'] != 'Holand-Netherlands') # remove dutch cases stringIndexer = StringIndexer(inputCol='workclass', outputCol='workclass-encoded' ) # to one-hot-encode, transform a string column model = stringIndexer.fit(df) indexed = model.transform( df) # workclass transformed to workclass-encoded (float) encoder = OneHotEncoder(dropLast=False, inputCol='workclass-encoded', outputCol='workclass-vec') one_hot_model = encoder.fit(indexed) encoded_df = encoded.transform(indexed)
# Index country codes; rows with unseen/invalid labels are skipped.
string_indexer = StringIndexer(inputCol="country_code", outputCol="country_indexed", handleInvalid="skip")
indexed_df = string_indexer.fit(w2v_df).transform(w2v_df)
indexed_df.select("country_code", "country_indexed").display()

# COMMAND ----------

# MAGIC %md Transforming label indices into binary vectors using OneHotEncoder

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder

# One-hot encode the country index (default dropLast=True).
ohe = OneHotEncoder(inputCol="country_indexed", outputCol="country_ohe")
ohe_df = ohe.fit(indexed_df).transform(indexed_df)
ohe_df.select("country_code", "country_ohe").display()

# COMMAND ----------

# MAGIC %md Run StringIndexer on `quantity` column as well.

# COMMAND ----------

# Index quantity values too, again skipping invalid rows.
qty_indexer = StringIndexer(inputCol="quantity", outputCol="quantity_indexed", handleInvalid="skip")
qty_df = qty_indexer.fit(ohe_df).transform(ohe_df)
qty_df.select("quantity", "quantity_indexed").display()

# COMMAND ----------
# Transform 'duration' for non-test runs:
#   clamp zero durations to 1e-6, then duration <- exp(log(1e-6)/duration).
# NOTE(review): the intent of this transform is not documented here —
# confirm the formula with the model owner before changing anything.
if args.mode != 'test':
    dataset = dataset.withColumn('duration', F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
    dataset = dataset.withColumn('duration', F.log(F.lit(1e-6))/F.col('duration'))
    dataset = dataset.withColumn('duration', F.exp(F.col('duration')))

# String-index 'source': fit and persist the model when training, otherwise
# load the previously saved model so the index mapping stays consistent.
stringIndex_model = None
if args.mode == 'train':
    stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
    stringIndex_model = stringIndexer.fit(dataset)
    stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
else:
    stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
dataset = stringIndex_model.transform(dataset)

# One-hot encode the source index, with the same train/load pattern.
encoder_model = None
if args.mode == 'train':
    encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
    encoder_model = encoder.fit(dataset)
    encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
else:
    encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
dataset = encoder_model.transform(dataset)

# Assemble the model features into a single vector column.
feature_cols = ['source_vec', 'aging', 'PC1', 'PC2', 'PC3', 'PC4']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
dataset = assembler.transform(dataset)

# Standardize features (zero mean, unit variance), same train/load pattern.
scaler_model = None
if args.mode == 'train':
    scaler = StandardScaler(inputCol='feature_vec', outputCol='scaled_feature_vec', withStd=True, withMean=True)
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)
# Index every categorical column into "<col>_str_ind"; alphabetic ordering
# makes the index assignment deterministic across runs.
string_index_encoder = StringIndexer(
    inputCols=categorical_columns,
    outputCols=[c + '_str_ind' for c in categorical_columns],
    stringOrderType='alphabetAsc')
train_car_data_frame = string_index_encoder.fit(car_data_frame).transform(
    car_data_frame)
train_car_data_frame.head(1)

# In[ ]:

# One-hot encode each indexed column into "<col>_vec"; dropLast=False keeps
# a vector slot for every category.
one_hot_encoder = OneHotEncoder(
    inputCols=[c + '_str_ind' for c in categorical_columns],
    outputCols=[c + '_vec' for c in categorical_columns],
    dropLast=False)
train_car_data_frame = one_hot_encoder.fit(train_car_data_frame).transform(
    train_car_data_frame)
train_car_data_frame.head(1)

# In[ ]:

# Numeric feature columns (list continues below this chunk).
numeric_columns = [
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
#One hot encoding the columns which have been converted from cat to numerical #Categorical columns that need to be one hot encoded cat_columns = [ 'sex_index', 'marital_index', 'relationship_index', 'workclass_index', 'occupation_index' ] #new column names after OHE cat_columns_enc = [col + '_enc' for col in cat_columns] ohe = OneHotEncoder(inputCols=cat_columns, outputCols=cat_columns_enc, dropLast=True) ohe = ohe.fit(df2) df2 = ohe.transform(df2) #dropping the older columns for col in cat_columns: df2 = df2.drop(col) print('Dataframe and columns after OHE:') df2.show(2) print(df2.columns) # Converting the preprocessed columns into vectors # Pyspark expects the features to be coalesced together, so this snippet of code # combines all the features that will be used for prediction of the target # and stores them in the same dataframe under a new column 'vectors'
from pyspark.ml.feature import StandardScaler

# Scale feature vectors to unit standard deviation (mean left uncentered).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
scalerModel = scaler.fit(irisLpDf)
irisNormalizedDf = scalerModel.transform(irisLpDf)
irisNormalizedDf.printSchema()

""" Label OneHotEncoder (Estimator \ the Transformer is now deprecated)"""
print("One Hot Encoder")
from pyspark.ml.feature import OneHotEncoder

# dropLast=False keeps one vector slot per label value.
encoder = OneHotEncoder(dropLast=False, inputCol="label", outputCol="encoded_label")
encoder_model = encoder.fit(irisNormalizedDf)
irisNormalizedencodedDf = encoder_model.transform(irisNormalizedDf)
irisNormalizedencodedDf.printSchema()

""" PCA Principal Component Analysis"""
print("PCA Analysis")
from pyspark.ml.feature import PCA

# Project the scaled features onto the top 3 principal components.
pca = PCA(k=3, inputCol="scaled_features", outputCol="pcaFeatures")
pca_model = pca.fit(irisNormalizedencodedDf)
iris_pcaDf = pca_model.transform(irisNormalizedencodedDf)
iris_pcaDf.printSchema()
print(
    "PCA model explained variance", pca_model.explainedVariance,
    "Cumulative explained variance", pca_model.explainedVariance[0] +
    pca_model.explainedVariance[1] + pca_model.explainedVariance[2])

""" Define the final DataFrame : apply the pipeline"""
def test_log_stage_type_params(spark_session):
    """Check that mlflow autologging records stage-typed params
    (Transformer/Model/Evaluator values) by their class names."""
    from pyspark.ml.base import Estimator, Transformer, Model
    from pyspark.ml.evaluation import Evaluator
    from pyspark.ml.param import Param, Params
    from pyspark.ml.feature import Binarizer, OneHotEncoder

    class TestingEstimator(Estimator):
        # Params whose values are themselves ML stage objects.
        transformer = Param(Params._dummy(), "transformer", "a transformer param")
        model = Param(Params._dummy(), "model", "a model param")
        evaluator = Param(Params._dummy(), "evaluator", "an evaluator param")

        def setTransformer(self, transformer: Transformer):
            return self._set(transformer=transformer)

        def setModel(self, model: Model):
            return self._set(model=model)

        def setEvaluator(self, evaluator: Evaluator):
            return self._set(evaluator=evaluator)

        def _fit(self, dataset):
            return TestingModel()

    class TestingModel(Model):
        # Identity transform: fitting produces a no-op model.
        def _transform(self, dataset):
            return dataset

    binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
    df = spark_session.createDataFrame([(0.0, ), (1.0, ), (2.0, )], ["input"])
    ohe = OneHotEncoder().setInputCols(["input"]).setOutputCols(["output"])
    ohemodel = ohe.fit(df)
    bcd = BinaryClassificationEvaluator(metricName="areaUnderROC")

    # Stage-valued params should be logged as the stage's class name.
    estimator = TestingEstimator().setTransformer(binarizer).setModel(
        ohemodel).setEvaluator(bcd)
    param_map = get_params_to_log(estimator)
    assert param_map["transformer"] == "Binarizer"
    assert param_map["model"] == "OneHotEncoderModel"
    assert param_map["evaluator"] == "BinaryClassificationEvaluator"

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        estimator.fit(df)
        # The estimator hierarchy recorded in the run artifact should match
        # the metadata generated from the estimator itself.
        metadata = _gen_estimator_metadata(estimator)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
        assert isinstance(estimator_info["hierarchy"]["params"], dict)
        assert estimator_info["hierarchy"]["params"]["transformer"][
            "name"] == "Binarizer"
        assert estimator_info["hierarchy"]["params"]["model"][
            "name"] == "OneHotEncoderModel"
        assert (estimator_info["hierarchy"]["params"]["evaluator"]["name"] ==
                "BinaryClassificationEvaluator")

    # Logged run params must equal the (truncated, stringified) estimator params.
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(estimator)))
# Standardize the assembled feature vector (center and scale).
scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)
scaledHousing.select('scaled_features').show()

# 3-2 (translated reviewer note: original comment said "don't quite
# understand this part") — inspect the categorical column's distinct values.
distinct = renamedHousing.select('ocean_proximity').distinct().collect()
print(distinct)
renamedHousing.agg(countDistinct("ocean_proximity")).show()

# Index the ocean_proximity strings, then one-hot encode the index.
indexer = StringIndexer().setInputCol('ocean_proximity').setOutputCol('idx_ocean_proximity')
idxHousing = indexer.fit(renamedHousing).transform(renamedHousing)
idxHousing.show()

encoder = OneHotEncoder().setInputCol('idx_ocean_proximity').setOutputCol('one_hot_ocean_proximity')
ohHousing = encoder.fit(idxHousing).transform(idxHousing)
ohHousing.show()

#4
# Run the numeric stages and the categorical stages as two small pipelines.
numPipeline = [imputer,va,scaler]
catPipeline = [indexer,encoder]
pipeline = Pipeline(stages=numPipeline)
newHousing = pipeline.fit(renamedHousing).transform(renamedHousing)
newHousing = newHousing.drop('features')
newHousing.show()

pipeline = pipeline.setStages(catPipeline)
newHousing = pipeline.fit(newHousing).transform(newHousing)
newHousing.show()

# Final assembler combining scaled numeric features with the one-hot vector.
va2 = VectorAssembler().setInputCols(['scaled_features','one_hot_ocean_proximity']).setOutputCol('features')
# Apply the gender StringIndexer (fitted above this chunk) and preview.
# NOTE(review): the indexer is fitted twice here; the first fit/transform is
# only used for the show(5) preview.
indexer.fit(spark_df).transform(spark_df).show(5)
temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("gender_label", temp_sdf["gender_label"].cast("integer"))

# Drop the raw string columns that have been replaced by indexed versions.
spark_df = spark_df.drop('segment')
spark_df = spark_df.drop('geography')
spark_df = spark_df.drop('gender')

##################################################
# One Hot Encoding
##################################################

# One-hot encode the age bucket and the has-credit-card flag.
encoder = OneHotEncoder(inputCols=["age_cat"], outputCols=["age_cat_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

encoder = OneHotEncoder(inputCols=["hascrcard"], outputCols=["hascrcard_cat_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

##################################################
# definition of TARGET
##################################################

# definition of TARGET
# Index 'exited' into the integer 'label' column expected by Spark ML.
stringIndexer = StringIndexer(inputCol='exited', outputCol='label')
temp_sdf = stringIndexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("label", temp_sdf["label"].cast("integer"))

##################################################
# Categorical tonal/audio features to be indexed and one-hot encoded.
categorical_features_names = [
    'tonal_chords_key', 'tonal_chords_scale', 'tonal_key_key',
    'tonal_key_scale'
]
indexed_names = list(
    map(lambda x: x + '_indexed', categorical_features_names))
encoded_names = list(map(lambda x: x + '_encoded', indexed_names))

# Index the categorical columns; "keep" assigns unseen labels a spare index.
indexer = StringIndexer(inputCols=categorical_features_names,
                        outputCols=indexed_names).setHandleInvalid("keep")
df_ = indexer.fit(df_).transform(df_) \
    .drop(*categorical_features_names)

# One-hot encode every index; dropLast=False keeps all category slots.
encoder = OneHotEncoder(inputCols=indexed_names,
                        outputCols=encoded_names,
                        dropLast=False)
df_ = encoder.fit(df_).transform(df_)

# Expand each one-hot vector into a plain array column ("<name>_arr").
for name in encoded_names:
    df_ = df_.withColumn(name + "_arr", one_hot_vector_to_array(col(name)))

# cleanup: drop the intermediate index and vector columns
for name in indexed_names:
    df_ = df_.drop(name)
for name in encoded_names:
    df_ = df_.drop(name)

# flatten array columns into scalar columns; persist to disk since the
# frame is reused downstream.
df_ = flatten_df_arrays(df_) \
    .persist(StorageLevel.DISK_ONLY)

feature_columns = df_.columns
feature_columns.remove('rec_MBID')  # the MusicBrainz id is not a feature