from pyspark.ml.feature import StringIndexer

# Encode the string column "Gender" as numeric label indices in a new
# column "GenderIndex".
# NOTE: `data` is expected to be defined earlier (presumably a Spark
# DataFrame containing a "Gender" column -- confirm against the caller).
indexer = StringIndexer(inputCol="Gender", outputCol="GenderIndex")

# fit() learns the label -> index mapping from `data`; transform() then
# returns the data with the "GenderIndex" column added.
indexed = indexer.fit(data).transform(data)
# Package Library: PySpark ML (pyspark.ml)
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Build one StringIndexer per categorical column; each writes its result to
# "<column>_index".
# NOTE: every indexer is fitted on `data` right here, so `indexers` holds
# already-fitted models rather than unfitted estimators. `data` and
# `categorical_columns` are expected to be defined earlier.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(data)
    for column in categorical_columns
]

# Combine all of the index columns into a single vector column "features".
assembler = VectorAssembler(
    inputCols=[stage.getOutputCol() for stage in indexers],
    outputCol="features",
)

# Chain the fitted indexers and the assembler, then run them over `data`.
pipeline = Pipeline(stages=indexers + [assembler])
model = pipeline.fit(data)

# `data` is rebound to the transformed result (original columns plus the
# "<column>_index" columns and "features").
data = model.transform(data)