Example #1
0
from pyspark.mllib.linalg import DenseVector
from pyspark.ml.linalg import DenseVector
from pyspark.mllib.linalg import Vectors, VectorUDT

# Row factory for the reshaped records: (mark, label, features).
row = Row('mark', 'label', 'features')

# Keep the marker/label columns first, followed by the feature columns, so
# that positions are: 0 - mark, 1 - label, 2 onwards - features.
df_indexed = df_indexed[labelCol + featuresCol]

# Pack every feature column (position 2 onwards) into a single DenseVector.
lf = df_indexed.rdd.map(
    lambda r: row(r[0], r[1], DenseVector(r[2:]))).toDF()

# Index the label; the resulting 'index' column is the labelCol used below.
lf = StringIndexer(inputCol='label', outputCol='index').fit(lf).transform(lf)

# Split back into train/test data using the 'mark' column.
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

# Random split further to get train/validate (70/30, fixed seed).
train, validate = train.randomSplit([0.7, 0.3], seed=121)

# print(...) with one parenthesised argument behaves the same on Python 2
# and Python 3, unlike the print statement.
print('Train Data Number of Row: ' + str(train.count()))
print('Validate Data Number of Row: ' + str(validate.count()))
print('Test Data Number of Row: ' + str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05,
                        labelCol='index').fit(train)
def analyze(sc, train_path, test_path):
    """Fit three classifiers on the train/test text files and print scores.

    Loads both files with ``sc.textFile``, parses them with ``parseTrain`` /
    ``parseTest`` (defined elsewhere in this file), unions them, casts the
    numeric columns to double, fills missing ``Age``/``Fare`` values with the
    column mean, derives a ``Title`` column from ``Name``, string-indexes the
    categorical columns and packs the features into a ``DenseVector``.
    Logistic regression, decision tree and random forest models are then
    fitted on 70% of the training rows and scored on the remaining 30% with
    ``BinaryClassificationEvaluator``.

    Args:
        sc: Object providing ``textFile(path)`` (presumably a SparkContext).
        train_path: Path of the training data, passed to ``sc.textFile``.
        test_path: Path of the test data, passed to ``sc.textFile``.

    Returns:
        None. The schema, sample rows, row counts and model scores are
        printed.
    """
    train_df = parseTrain(sc.textFile(train_path))
    test_df = parseTest(sc.textFile(test_path))

    # Tag every row with its origin so the two sets can be separated again
    # after the shared preprocessing; test rows get a placeholder label of 0.
    train_df = train_df.withColumn('Mark', lit('train'))
    test_df = (test_df.withColumn('Survived', lit(0))
               .withColumn('Mark', lit('test')))
    # Align the column order with the training data before the union.
    test_df = test_df[train_df.columns]
    ## Append Test data to Train data
    df = train_df.unionAll(test_df)

    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    for name in numVars:
        df = df.withColumn(name, df[name].cast('double'))
    df.printSchema()

    # NOTE(review): `missing` is not read again in this function. It is kept
    # because countNull is defined elsewhere and may have side effects --
    # confirm before removing.
    missing = {var: countNull(df, var) for var in numVars}

    # Both means come from a single aggregation instead of one per column.
    age_mean, fare_mean = df.groupBy().mean('Age', 'Fare').first()
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})

    ## created user defined function to extract title
    # NOTE(review): this keeps everything before the first '.', e.g.
    # 'Braund, Mr. Owen Harris' -> 'Braund, Mr' -- confirm that is intended.
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)

    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']
    # One fitted StringIndexer per categorical column, each writing to
    # '<column>_indexed'; the pipeline applies them all to df.
    indexers = [
        StringIndexer(inputCol=name, outputCol=name + '_indexed').fit(df)
        for name in catVars
    ]
    df_indexed = Pipeline(stages=indexers).fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)

    # Features are the numeric columns except the label, followed by the
    # indexed categorical columns.
    featuresCol = [name for name in numVars if name != 'Survived']
    featuresCol += [name + '_indexed' for name in catVars]
    labelCol = ['Mark', 'Survived']
    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]
    # 0-mark, 1-label, 2-features
    # map features to DenseVector
    lf = df_indexed.rdd.map(
        lambda r: row(r[0], r[1], DenseVector(r[2:]))).toDF()
    # index label
    # convert numeric label to categorical, which is required by
    # decisionTree and randomForest
    lf = StringIndexer(inputCol='label',
                       outputCol='index').fit(lf).transform(lf)
    lf.show(3)

    # split back train/test data
    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')
    # random split further to get train/validate
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Row: ' + str(train.count()))
    print('Validate Data Number of Row: ' + str(validate.count()))
    print('Test Data Number of Row: ' + str(test.count()))
    lr = LogisticRegression(maxIter=100, regParam=0.05,
                            labelCol='index').fit(train)

    # Evaluate model based on auc ROC(default for binary classification)
    def testModel(model, validate=validate):
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))
    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecistionTree': dt,
        'RandomForest': rf
    }
    # dict.items() exists on both Python 2 and 3; iteritems() is Python 2
    # only.
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)
def _null_counts(df):
    """Return a DataFrame holding the number of nulls in each column of df.

    `sum` and `col` are the names already in scope in this file; `sum` is
    applied to a Column expression here, so it is presumably the Spark SQL
    aggregate rather than the Python builtin.
    """
    return df.select(
        *(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns))


# NOTE(review): the null-count DataFrames below are built but neither
# assigned nor shown here; the findings in the trailing comments are the
# original author's.

#Find missings
_null_counts(customers) #No missings

#Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID","cIDCustomer")
#DELIVERY
#Check schema and first rows
delivery.printSchema() #Schema is ok
# Limit before converting so only five rows are brought to the driver,
# rather than the whole DataFrame.
delivery.limit(5).toPandas()

#Find missings
_null_counts(delivery) #Found 780 missings in the DeliveryClass column

#Treating missing values: drop the rows without a DeliveryClass
delivery = delivery.where(col("DeliveryClass").isNotNull())

#Encoding string columns in "Delivery"
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

#Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID","sID_Delivery")
#FORMULA
#Check schema and first rows
formula.printSchema() #Schema is ok
# Limit before converting so only five rows are brought to the driver.
formula.limit(5).toPandas()

#Find missings
_null_counts(formula) #No missings