Python StringIndexer.where Examples

Programming Language: Python

Namespace/Package Name: pyspark.ml.feature

Class/Type: StringIndexer

Method/Function: where

Examples at hotexamples.com: 3

Python StringIndexer.where - 3 examples found. These are the top-rated real-world Python examples of pyspark.ml.feature.StringIndexer.where extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

StringIndexer(30)

fit(30)

transform(30)

getOutputCol(22)

show(19)

select(15)

setHandleInvalid(14)

write(10)

drop(9)

randomSplit(8)

toPandas(4)

withColumnRenamed(4)

getInputCol(3)

withColumn(3)

groupBy(3)

where(3)

printSchema(3)

save(2)

setInputCol(2)

count(2)

take(1)

describe(1)

setOutputCol(1)

filter(1)

dropna(1)

fitAsync(1)

orderBy(1)

_call_java(1)

labels(1)

groupby(1)

getOutputCols(1)

fillna(1)

load(1)

Example #1

Show file

from pyspark.mllib.linalg import DenseVector
from pyspark.ml.linalg import DenseVector
from pyspark.mllib.linalg import Vectors, VectorUDT

# Row factory used to build (mark, label, features) records below.
row = Row('mark','label','features')

# Keep only the columns named in labelCol followed by those in featuresCol.
df_indexed = df_indexed[labelCol+featuresCol]
# 0-mark, 1-label, 2-features

# Pack every column from position 2 onward into a single DenseVector.
lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1],DenseVector(r[2:])))).toDF()
# index label
lf = StringIndexer(inputCol = 'label',outputCol='index').fit(lf).transform(lf)

# split back train/test data
train = lf.where(lf.mark =='train')
test = lf.where(lf.mark =='test')

# random split further to get train/validate
train,validate = train.randomSplit([0.7,0.3],seed =121)

# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare `print '...'` statement form is a syntax error on Python 3.
print('Train Data Number of Row: '+ str(train.count()))
print('Validate Data Number of Row: '+ str(validate.count()))
print('Test Data Number of Row: '+ str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter = 100, regParam = 0.05, labelCol='index').fit(train)

Example #2

Show file

File: titanic_spark.py Project: R-I-S-Khan/Article-Classifier-using-Apache-Spark

def analyze(sc, train_path, test_path):
    """Fit three classifiers on the combined train/test data and print scores.

    Both inputs are read with ``sc.textFile`` and parsed by ``parseTrain`` /
    ``parseTest``. Each row is tagged with a ``Mark`` of ``'train'`` or
    ``'test'`` and the two frames are unioned so that casting, mean
    imputation and string indexing are applied to all rows at once. The
    rows are then split back by ``Mark``; the training part is further
    split 70/30 into train/validate, and a logistic regression, a decision
    tree and a random forest are fitted and scored on the validation part.

    Args:
        sc: Object providing ``textFile`` (presumably a SparkContext).
        train_path: Path passed to ``sc.textFile`` for the training data.
        test_path: Path passed to ``sc.textFile`` for the test data.

    Returns:
        None. Schema, sample rows, row counts and evaluation scores are
        printed.
    """
    train_rdd = sc.textFile(train_path)
    test_rdd = sc.textFile(test_path)
    train_df = parseTrain(train_rdd)
    test_df = parseTest(test_rdd)
    train_df = train_df.withColumn('Mark', lit('train'))
    # Test rows get a constant 0 in 'Survived' so both frames share columns.
    test_df = (test_df.withColumn('Survived',
                                  lit(0)).withColumn('Mark', lit('test')))
    # Reorder the test columns to match the training frame before the union.
    test_df = test_df[train_df.columns]
    ## Append Test data to Train data
    df = train_df.unionAll(test_df)
    df = (df.withColumn('Age', df['Age'].cast('double')).withColumn(
        'SibSp', df['SibSp'].cast('double')).withColumn(
            'Parch', df['Parch'].cast('double')).withColumn(
                'Fare', df['Fare'].cast('double')).withColumn(
                    'Survived', df['Survived'].cast('double')))
    df.printSchema()
    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    # NOTE(review): `missing` is never read below; the calls are kept because
    # countNull is defined elsewhere and may have side effects.
    missing = {var: countNull(df, var) for var in numVars}
    # Replace nulls in 'Age' and 'Fare' with the respective column means.
    age_mean = df.groupBy().mean('Age').first()[0]
    fare_mean = df.groupBy().mean('Fare').first()[0]
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})
    ## created user defined function to extract title
    # Takes the text before the first '.' in the name, stripped of whitespace.
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)
    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']

    def indexer(df, col):
        """Return a StringIndexer fitted on `df` for column `col`."""
        si = StringIndexer(inputCol=col, outputCol=col + '_indexed').fit(df)
        return si

    # One fitted indexer per categorical column, applied together.
    indexers = [indexer(df, col) for col in catVars]
    pipeline = Pipeline(stages=indexers)
    df_indexed = pipeline.fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)
    catVarsIndexed = [i + '_indexed' for i in catVars]
    featuresCol = numVars + catVarsIndexed
    # 'Survived' is the label, so it must not appear among the features.
    featuresCol.remove('Survived')
    labelCol = ['Mark', 'Survived']
    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]
    # 0-mark, 1-label, 2-features
    # map features to DenseVector
    lf = df_indexed.rdd.map(lambda r:
                            (row(r[0], r[1], DenseVector(r[2:])))).toDF()
    # index label
    # convert numeric label to categorical, which is required by
    # decisionTree and randomForest
    lf = StringIndexer(inputCol='label',
                       outputCol='index').fit(lf).transform(lf)
    lf.show(3)
    # split back train/test data using the marker column
    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')
    # random split further to get train/validate
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Row: ' + str(train.count()))
    print('Validate Data Number of Row: ' + str(validate.count()))
    print('Test Data Number of Row: ' + str(test.count()))
    lr = LogisticRegression(maxIter=100, regParam=0.05,
                            labelCol='index').fit(train)

    # Evaluate model based on auc ROC(default for binary classification)
    def testModel(model, validate=validate):
        """Score `model` on `validate` with BinaryClassificationEvaluator."""
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))
    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecistionTree': dt,
        'RandomForest': rf
    }
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)

Example #3

Show file

File: Databricks_Churn Prediction.py Project: Betsy-Varghese/Predictive-Modeling-Python

# Null count per column of "customers" (author's note: no missings)
customers.select(
    *(
        sum(col(field).isNull().cast("int")).alias(field)
        for field in customers.columns
    )
)

# Rename the CustomerID column ahead of later joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")

# DELIVERY
# Inspect the schema and the first rows (author's note: schema is ok)
delivery.printSchema()
delivery.toPandas().head(5)

# Null count per column of "delivery"
# (author's note: found 780 missings in the DeliveryClass column)
delivery.select(
    *(
        sum(col(field).isNull().cast("int")).alias(field)
        for field in delivery.columns
    )
)

# Handle the missing values by keeping only rows with a DeliveryClass
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encode the string columns of "delivery", one StringIndexer per column,
# each fitted on and applied to the frame produced by the previous step
for source_name, indexed_name in (
    ("DeliveryClass", "DeliveryClass_index"),
    ("DeliveryTypeName", "DeliveryTypeName_index"),
):
    encoder = StringIndexer(inputCol=source_name, outputCol=indexed_name)
    delivery = encoder.fit(delivery).transform(delivery)

# Rename the SubscriptionID column ahead of later joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")

# FORMULA
# Inspect the schema and the first rows (author's note: schema is ok)
formula.printSchema()
formula.toPandas().head(5)

# Null count per column of "formula" (author's note: no missings)
formula.select(
    *(
        sum(col(field).isNull().cast("int")).alias(field)
        for field in formula.columns
    )
)