# Example #1
# 0
# Encode the port of embarkation: string label -> numeric index -> one-hot vector.
embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked1')
df5 = embarked_indexer.fit(df5).transform(df5)
df5.show()

# NOTE(review): calling .transform() directly on OneHotEncoder relies on the
# Spark 2.x transformer API; in Spark 3.x OneHotEncoder is an estimator that
# needs .fit() first -- confirm the target Spark version.
embarked_encoder = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False)
df5 = embarked_encoder.transform(df5)
df5.show()

# --------------------------------------------

# Same two-step encoding for the Sex column.
gender_indexer = StringIndexer(inputCol='Sex', outputCol='Gender')
gender_encoder = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False)
df5 = gender_indexer.fit(df5).transform(df5)
df5 = gender_encoder.transform(df5)
df5.show()


# Keep only the model inputs (Pclass cast to double) plus PassengerId,
# which is needed for the exported predictions.
kept_columns = [
    df5['Pclass'].cast('double'),
    df5['Gender1'],
    df5['Embarked2'],
    df5['PassengerId'],
]
df5 = df5.select(*kept_columns)
df5.printSchema()

# Vector assembler

assembler = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features')
df5 = assembler.transform(df5)
df5.show(truncate=False)


# Score the assembled rows with model2 (defined outside this section).
df5_1 = model2.transform(df5)
df5_1.show()

# coalesce(1) reduces the output to one partition before the CSV write.
predictions = df5_1.select('PassengerId', 'prediction')
predictions.coalesce(1).write.csv('c:/test5.csv')

#  df5_1.select('PassengerId','prediction').toPandas().to_csv('c:/test5.csv')

# Example #2
# 0
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

# Load the Titanic training CSV. No schema inference is requested, so the
# columns are cast explicitly further down before they are used as features.
df2 = spark.read.csv('/users/jyothsnap/Kaggle/titanic/train.csv', header=True)
df2.count()

# ---------------------------------------

# Keep every column used below. SibSp and Fare must be selected here because
# the numeric projection further down reads df3.SibSp and df3.Fare; leaving
# them out makes that projection fail with an AttributeError.
df3 = df2.select('Sex', 'Pclass', 'SibSp', 'Fare', 'Survived', 'Embarked')
df3.show()
df3.printSchema()

# Index the categorical string columns.
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
# Count of non-null Embarked values per distinct Embarked value. Group by the
# column once; grouping by it twice only duplicates the key column.
df3.groupby('Embarked').agg({'Embarked': 'count'}).show()
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked_Transformed').fit(df3).transform(df3)
df3.show()
df3.printSchema()

# Cast the label and the numeric features to double. This projection keeps
# only these four columns, so Gender and Embarked_Transformed are not carried
# into the feature vector.
df3 = df3.select(
    df3.Pclass.cast('double'),
    df3.SibSp.cast('double'),
    df3.Survived.cast('double'),
    df3.Fare.cast('double'),
)
df3.show()
df3.printSchema()

# Vector assembler

df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3)

df3.show()
#
# 1 choose approach
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', maxDepth=10, impurity='entropy')
# Replace nulls with 0 (DataFrame.fillna is the documented alias of DataFrame.na.fill)
complaints_final = complaints_final.fillna(0)
#CUSTOMERS
#Check schema and first rows
customers.printSchema() #Schema is ok
# Limit on the Spark side so only five rows are collected to the driver,
# instead of converting the whole table to pandas and then taking head(5).
customers.limit(5).toPandas()

#Count null values per column
# Assumes `sum` and `col` are the pyspark.sql.functions versions -- TODO confirm imports.
# .show() is needed: select() alone is lazy and its result was being discarded.
customers.select([sum(col(c).isNull().cast("int")).alias(c) for c in customers.columns]).show() #No missings

#Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID","cIDCustomer")
#DELIVERY
#Check schema and first rows
delivery.printSchema() #Schema is ok
# Limit on the Spark side so only five rows are collected to the driver,
# instead of converting the whole table to pandas and then taking head(5).
delivery.limit(5).toPandas()

#Count null values per column
# Assumes `sum` and `col` are the pyspark.sql.functions versions -- TODO confirm imports.
# .show() is needed: select() alone is lazy and its result was being discarded.
delivery.select([sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns]).show() #Found 780 missings in the DeliveryClass column

#Treating missing values: drop the rows whose DeliveryClass is null
delivery = delivery.where(col("DeliveryClass").isNotNull())

#Encoding string columns in "Delivery" as numeric indices
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

#Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID","sID_Delivery")
#FORMULA