indexed.show()

# COMMAND ----------

# IndexToString maps an indexed label column back to the original string labels,
# using the labels StringIndexer stored in the output column's metadata.
from pyspark.ml.feature import IndexToString, StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"],
)

# Fit a StringIndexer on the 'category' column and index the frame.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)
print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

# Recover the original labels from the index column via its metadata.
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

# COMMAND ----------
# Feature engineering: derive calendar features from the Date column, index the
# categorical currency code, one-hot encode it, and assemble everything into a
# single 'features' vector for downstream regression on 'Rate'.
# NOTE(review): assumes `df` (from an earlier cell) has columns
# 'Date', 'Currency Code', and 'Rate' — confirm against the loading cell.
data = df.withColumn("WeekOfYear", weekofyear(df["Date"]))
data2 = data.withColumn("dayOfMonth", dayofmonth(data["Date"]))
data2.printSchema()

good_data = data2.select(["WeekOfYear", "dayOfMonth", "Currency Code", "Rate"])

# Index the categorical 'Currency Code' column to numeric indices.
currency_indexer = StringIndexer(inputCol='Currency Code', outputCol='CurrencyIndex')
model = currency_indexer.fit(good_data)
indexed = model.transform(good_data)
print("Transformed string column '%s' to indexed column '%s'"
      % (currency_indexer.getInputCol(), currency_indexer.getOutputCol()))
indexed.show()

# One-hot encode the currency index.
# FIX: since Spark 3.0, OneHotEncoder is an Estimator and must be fit() before
# transforming; the original `encoder.transform(indexed)` only worked with the
# Spark 2.x Transformer API and raises AttributeError on Spark 3+. The hasattr
# guard keeps this cell working on both versions.
encoder = OneHotEncoder(inputCol="CurrencyIndex", outputCol="CurrencyVec")
encoded = (encoder.fit(indexed) if hasattr(encoder, "fit") else encoder).transform(indexed)
encoded.show()

# Assemble the numeric and encoded columns into one feature vector.
assembler = VectorAssembler(
    inputCols=['WeekOfYear', 'dayOfMonth', 'CurrencyVec'],
    outputCol='features')
output = assembler.transform(encoded)
print(
    "Assembled columns 'WeekOfYear', 'dayOfMonth', 'CurrencyVec' to vector column 'features'"
)
#output.select("features", "Rate").show(truncate=False)
if __name__ == "__main__":
    # Standalone runner for the IndexToString example: index a string column,
    # then map the indices back to the original labels via column metadata.
    spark = (SparkSession.builder
             .appName("IndexToStringExample")
             .getOrCreate())

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    # Fit the indexer and transform the frame.
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)
    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    # Invert the indexing using the labels stored in the metadata.
    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)
    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()
# Assemble the full modeling pipeline: label indexing, feature assembly,
# then multinomial logistic regression.
stages = [indexer, assembler, multinomialRegression]
pipeline = Pipeline(stages=stages)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Indexer checks
dbTest("ML1-P-07-02-01", True, type(indexer) == type(StringIndexer()))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

# Assembler checks
dbTest("ML1-P-07-02-04", True, type(assembler) == type(VectorAssembler()))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

# Regression and pipeline checks
dbTest("ML1-P-07-02-07", True, type(multinomialRegression) == type(LogisticRegression()))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')
dbTest("ML1-P-07-02-10", True, type(pipeline) == type(Pipeline()))

print("Tests passed!")

# COMMAND ----------