Python StringIndexer.groupBy Examples

Programming Language: Python

Namespace/Package Name: pyspark.ml.feature

Class/Type: StringIndexer

Method/Function: groupBy

Examples at hotexamples.com: 3

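Python StringIndexer.groupBy - 3 examples found. These are the top-rated real-world Python examples of pyspark.ml.feature.StringIndexer.groupBy extracted from open source projects. Note that in the examples below, groupBy is called on DataFrame objects (for instance, the result of a transform() call), not on the StringIndexer itself. You can rate examples to help us improve their quality.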

Frequently Used Methods

Show Hide

StringIndexer(30)

fit(30)

transform(30)

getOutputCol(22)

show(19)

select(15)

setHandleInvalid(14)

write(10)

drop(9)

randomSplit(8)

toPandas(4)

withColumnRenamed(4)

getInputCol(3)

withColumn(3)

groupBy(3)

where(3)

printSchema(3)

save(2)

setInputCol(2)

count(2)

take(1)

describe(1)

setOutputCol(1)

filter(1)

dropna(1)

fitAsync(1)

orderBy(1)

_call_java(1)

labels(1)

groupby(1)

getOutputCols(1)

fillna(1)

load(1)

Example #1

Show file

File: Databricks_Churn Prediction.py Project: Betsy-Varghese/Predictive-Modeling-Python

# Index the categorical PaymentStatus column of the merged table
payment_status_indexer = StringIndexer(inputCol="PaymentStatus", outputCol="PaymentStatus_index")
table1 = payment_status_indexer.fit(table1).transform(table1)

# Derive time-based variables: subscription length (days / months) and start year
table1 = (
    table1
    .withColumn("DaysSubscription", datediff(col("EndDate"), col("StartDate")))
    .withColumn("MonthsSubscription", months_between(col("EndDate"), col("StartDate")))
    .withColumn("Year", year("StartDate"))
)

# COMMAND ----------

# Feature engineering
# Every aggregate below is computed per CustomerID, so group once and reuse.
per_customer = table1.groupBy("CustomerID")

# Overall per-customer totals (column order matches the expression order)
totals_exprs = [
    count("SubscriptionID"),
    avg("DaysSubscription"),
    avg("MonthsSubscription"),
    sum("NbrMeals_REG"),
    sum("NbrMeals_EXCEP"),
    min("NbrMealsPrice"),
    max("NbrMealsPrice"),
    avg("NbrMealsPrice"),
    min("ProductDiscount"),
    max("ProductDiscount"),
    sum("ProductDiscount"),
    min("TotalDiscount"),
    max("TotalDiscount"),
    sum("TotalDiscount"),
    min("TotalPrice"),
    max("TotalPrice"),
    sum("TotalPrice"),
    min("TotalCredit"),
    max("TotalCredit"),
    sum("TotalCredit"),
]
subs_totals = per_customer.agg(*totals_exprs)

# Per-customer sums, pivoted by product name; the key column is renamed to cIDProduct
product_exprs = [
    sum("NbrMeals_REG"),
    sum("NbrMeals_EXCEP"),
    sum("NbrMealsPrice"),
    sum("ProductDiscount"),
    sum("TotalDiscount"),
    sum("TotalPrice"),
    sum("TotalCredit"),
]
subs_products = per_customer.pivot("ProductName").agg(*product_exprs)
subs_products = subs_products.withColumnRenamed("CustomerID", "cIDProduct")

# Per-customer price/credit sums, pivoted by payment type; key renamed to cIDPayment
payment_exprs = [sum("TotalPrice"), sum("TotalCredit")]
subs_payment_type = per_customer.pivot("PaymentType").agg(*payment_exprs)
subs_payment_type = subs_payment_type.withColumnRenamed("CustomerID", "cIDPayment")

# Per-customer subscription counts, pivoted by subscription start year; key renamed to cIDYear
subs_year = per_customer.pivot("Year").agg(count("SubscriptionID"))
subs_year = subs_year.withColumnRenamed("CustomerID", "cIDYear")

Example #2

Show file

File: pyspark_tutorials.py Project: linghui-wu/LargeScaleComputing_A20

# Average minutes on ground at JFK: intercept plus the coefficient at index 3
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA: intercept plus the coefficient at index 4
# (fixed: the original read `regression.regression.coefficients`, a doubled attribute)
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

# RPM buckets
from pyspark.ml.feature import Bucketizer
# Fixed keyword names: Bucketizer takes `splits` and `outputCol`
# (the original `split=` / `outpuCol=` raise TypeError).
bucketizer = Bucketizer(splits=[3500, 4500, 6000, 6500],
    inputCol='rpm', outputCol='rpm_bin')
# Apply the buckets to the rpm column
cars = bucketizer.transform(cars)
# Show the RPM buckets. Fixed: the bucketed rpm columns live on `cars`;
# `bucketed` is only assigned further down, from the flights data.
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Engineering density
cars = cars.withColumn('density_line', cars.mass / cars.length)  # Linear density
cars = cars.withColumn('density_quad', cars.mass / cars.length ** 2)  # Area density
cars = cars.withColumn('density_cube', cars.mass / cars.length ** 3)  # Volume density

# NOTE(review): OneHotEncoderEstimator is only available in Spark 2.3-2.4 (it was
# renamed OneHotEncoder in Spark 3.0) - confirm the target Spark version.
from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Create buckets at 3 hour intervals through the day (splits 0, 3, ..., 24)
buckets = Bucketizer(splits=[3 * x for x in range(9)], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

Example #3

Show file

File: tree 2.py Project: sh994m/Machine-Learning

from pyspark.ml.feature import StringIndexer, OneHotEncoder
# Index the string column 'Embarked' into the new column 'Embarked1',
# then display the resulting DataFrame.
df3 = StringIndexer(inputCol='Embarked',
                    outputCol='Embarked1').fit(df3).transform(df3)
df3.show()

# One-hot encode 'Embarked1' into 'Embarked2', keeping every category (dropLast=False).
# NOTE(review): transform() is called without fit(); this presumably targets the
# Spark 2.x OneHotEncoder (a plain transformer) - confirm the Spark version.
df3 = OneHotEncoder(inputCol='Embarked1',
                    outputCol='Embarked2',
                    dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

# Index the string column 'Sex' into the new column 'Gender'.
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)

# Per group: count of 'Embarked' and sum of 'Embarked1'.
# NOTE(review): the same column is passed to groupBy twice (once as a Column,
# once by name) - presumably unintentional; verify.
df3.groupBy(df3.Embarked, 'Embarked').agg({
    'Embarked': 'count',
    'Embarked1': 'sum'
}).show()
df3.show(5)

# Inspect the data: first rows and schema.
df3.show(5)
df3.show(10)
# Bare expression: the value is discarded when run as a script
# (it would only be echoed in an interactive session or notebook).
df3.schema
df3.printSchema()
# --------------------------------------------

# NOTE(review): df4 is not defined anywhere in this excerpt - confirm where it comes from.
df4.show()
df4.printSchema()

# NOTE(review): neither `fit` nor `si1` is defined in this excerpt; as written this
# line would raise NameError unless both are defined elsewhere in the file.
fit(si1)
# Presumably records the label -> index mapping of the 'Gender' column;
# verify against the fitted StringIndexer's labels.
male = 0
female = 1