Example #1
def main(config):
    # Boilerplate sequence of steps needed to train and evaluate a model,
    # following the general pipeline outlined in Spark's MLlib docs:
    # https://spark.apache.org/docs/latest/ml-pipeline.html

    spark = spark_initiate()

    # some data / transformer
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()

    df, cat_dict = transformer(data)
    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')
    list_str = [] # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature, 
                               outputCol=feature + '_index'
                               ) \
                 .fit(df) \
                 .transform(df)
    df = df.drop(*list_str)
    df.show()
    # use every remaining column except the label as a feature
    features = [c for c in df.columns if c != config['base']['labelCol']]
    assembler = VectorAssembler(inputCols=features,
                                outputCol='features')
    df = assembler.transform(df)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator

    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)
    # score = probability of the positive class, label = Survived
    predictionAndLabels = testData.select('probability', 'Survived') \
                                  .rdd.map(lambda x: (float(x[0][1]),
                                                      float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)

    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
Example #2
df2 = spark.read.csv('E:/kaggle/titanic/train_kaggle.csv', header=True)
df2.count()

# ---------------------------------------

df2.printSchema()
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked')
df3.show()
df3.printSchema()
# -------------------------------------------

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = StringIndexer(inputCol='Embarked',
                    outputCol='Embarked1').fit(df3).transform(df3)
df3.show()

df3 = OneHotEncoder(inputCol='Embarked1',
                    outputCol='Embarked2',
                    dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1',
                    dropLast=False).transform(df3)
df3.show()

# cast to double
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2,
                 df3.Survived.cast('double'))  # keep the label as a double as well
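This snippet uses the legacy Spark 2.x OneHotEncoder, which is a plain transformer. In Spark 3.x the encoder is an estimator and must be fitted first; a minimal sketch of the equivalent call on the same df3:

from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCols=['Embarked1', 'Gender'],
                        outputCols=['Embarked2', 'Gender1'],
                        dropLast=False)
# fit learns the number of categories per column, transform produces the dummy vectors
df3 = encoder.fit(df3).transform(df3)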
Example #3
    coviddeath = spark.read.csv('UScasestemp1.csv',
                                inferSchema=True,
                                header=True)
else:
    coviddeath = spark.sql("SELECT * FROM uscasestemp1_csv")

# COMMAND ----------

data = coviddeath.select("Year", "Date", "Day", "Temp", "Lat", "Long",
                         "Admin2", "Province",
                         ((col("Case") > 2).cast("Double").alias("label")))
data = StringIndexer(inputCol='Admin2',
                     outputCol='Admin2' + "_index").fit(data).transform(data)
data = StringIndexer(inputCol='Province',
                     outputCol='Province' + "_index").fit(data).transform(data)
data.show(5)

# COMMAND ----------

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
assembler = VectorAssembler(
    inputCols=["Day", "Temp", "Lat", "Province_index", "Admin2_index"],
    outputCol="features")
Example #4
# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights = StringIndexer(inputCol="org",
                        outputCol='org_idx').fit(flights).transform(flights)

# Check first five records
flights.show(5)

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"
],
                            outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in an 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)
Example #5
def analyze(sc, train_path, test_path):
    train_rdd = sc.textFile(train_path)
    test_rdd = sc.textFile(test_path)
    train_df = parseTrain(train_rdd)
    test_df = parseTest(test_rdd)
    train_df = train_df.withColumn('Mark', lit('train'))
    test_df = (test_df.withColumn('Survived',
                                  lit(0)).withColumn('Mark', lit('test')))
    test_df = test_df[train_df.columns]
    ## Append Test data to Train data
    df = train_df.unionAll(test_df)
    df = (df.withColumn('Age', df['Age'].cast('double')).withColumn(
        'SibSp', df['SibSp'].cast('double')).withColumn(
            'Parch', df['Parch'].cast('double')).withColumn(
                'Fare', df['Fare'].cast('double')).withColumn(
                    'Survived', df['Survived'].cast('double')))
    df.printSchema()
    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    missing = {var: countNull(df, var) for var in numVars}
    age_mean = df.groupBy().mean('Age').first()[0]
    fare_mean = df.groupBy().mean('Fare').first()[0]
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})
    ## created user defined function to extract title
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)
    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']
    si = StringIndexer(inputCol='Sex', outputCol='Sex_indexed')
    df_indexed = si.fit(df).transform(df).drop('Sex').withColumnRenamed(
        'Sex_indexed', 'Sex')

    def indexer(df, col):
        si = StringIndexer(inputCol=col, outputCol=col + '_indexed').fit(df)
        return si

    indexers = [indexer(df, col) for col in catVars]
    pipeline = Pipeline(stages=indexers)
    df_indexed = pipeline.fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)
    catVarsIndexed = [i + '_indexed' for i in catVars]
    featuresCol = numVars + catVarsIndexed
    featuresCol.remove('Survived')
    labelCol = ['Mark', 'Survived']
    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]
    # 0-mark, 1-label, 2-features
    # map features to DenseVector
    lf = df_indexed.rdd.map(lambda r:
                            (row(r[0], r[1], DenseVector(r[2:])))).toDF()
    # index label
    # convert numeric label to categorical, which is required by
    # decisionTree and randomForest
    lf = StringIndexer(inputCol='label',
                       outputCol='index').fit(lf).transform(lf)
    lf.show(3)
    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')
    # random split further to get train/validate
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Rows: ' + str(train.count()))
    print('Validate Data Number of Rows: ' + str(validate.count()))
    print('Test Data Number of Rows: ' + str(test.count()))
    lr = LogisticRegression(maxIter=100, regParam=0.05,
                            labelCol='index').fit(train)

    # Evaluate model on area under the ROC curve (the default metric for binary classification)
    def testModel(model, validate=validate):
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))
    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecisionTree': dt,
        'RandomForest': rf
    }
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)
Example #6
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer

df_train = spark.read.parquet("./cf_train_subsampled.parquet")
df_val = spark.read.parquet("./cf_validation_subsampled.parquet")
df_test = spark.read.parquet("./cf_test_subsampled.parquet")

df_train.createOrReplaceTempView('df_train')
df_val.createOrReplaceTempView('df_val')
df_test.createOrReplaceTempView('df_test')

df_train = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train).transform(df_train)
df_train = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(df_train).transform(df_train)
df_train.show()
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(rank=10, maxIter=10, regParam=0.1, alpha=1.0,
          userCol="user_id_numeric", itemCol="track_id_numeric", ratingCol="count",
          coldStartStrategy="drop")
model = als.fit(df_train)

# Evaluate the model by computing the RMSE on the test data
# (the StringIndexers fitted on df_train would also need to be applied to df_test first)
predictions = model.transform(df_test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
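The excerpt ends at this comment; a minimal sketch of the recommendation step using the standard ALSModel API on the fitted model above (note this dataset contains tracks, not movies):

# Top 10 track recommendations for each user
user_recs = model.recommendForAllUsers(10)
user_recs.show(5, truncate=False)

# Top 10 user recommendations for each track
track_recs = model.recommendForAllItems(10)
track_recs.show(5, truncate=False)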
Example #7
def main():
    spark, sc = init_spark()

    df = (spark.read.format("csv").option(
        'header', 'true').load("C:\\sparkTmp\\titanic_train.csv"))

    df.show(5)

    # How many rows we have
    print(df.count())

    # The names of our columns
    print(df.columns)

    # Types of our columns
    print(df.dtypes)

    df.describe().show()

    dataset = df.select(
        col("Survived").cast("float"),
        col("Pclass").cast("float"),
        col("Sex"),
        col("Age").cast("float"),
        col("Fare").cast("float"),
        col("Embarked"),
    )

    dataset.show()

    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()
    dataset = dataset.dropna(how="any")
    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()

    # We need to transform Sex and Embarked to numerical value
    dataset = StringIndexer(
        inputCol="Sex", outputCol="Gender",
        handleInvalid="keep").fit(dataset).transform(dataset)

    dataset = StringIndexer(
        inputCol="Embarked", outputCol="Boarded",
        handleInvalid="keep").fit(dataset).transform(dataset)

    # StringIndexer does not just produce a plain double column;
    # it also stores the original category values as column metadata
    print(dataset.schema.fields[7].metadata)

    dataset = dataset.drop("Sex")
    dataset = dataset.drop("Embarked")

    dataset.show()

    required_features = ["Pclass", "Age", "Fare", "Gender", "Boarded"]

    assembler = VectorAssembler(inputCols=required_features,
                                outputCol='features')

    transformed_data = assembler.transform(dataset)
    transformed_data.show()

    (training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])
    rf = RandomForestClassifier(labelCol="Survived",
                                featuresCol="features",
                                maxDepth=5)
    model = rf.fit(training_data)
    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = ", accuracy)
Example #8
featuresCol = [c for c in df.columns if c not in {'OUTCOME'}]

row = Row('unscaledFeatures', 'unlabel')

# df = df[featuresCol + labelCol]

# 0-features, 1-label
# map features to DenseVector
lf = df.rdd.map(lambda r: (row(DenseVector(r[:-1]), r[-1]))).toDF()

# index label
# convert numeric label to categorical, which is required by
# decisionTree and randomForest
lf = StringIndexer(inputCol='unlabel', outputCol='label').fit(lf).transform(lf)

lf.show(3)

# Double check that the data version is correct after conversion.

# In[10]:

lf.take(2)

# <a id="context24"></a>
# ### 2.4. Features Standardization
#
# The features are not all on the same scale; to help the downstream algorithms converge faster, we standardize each feature to have zero mean and unit variance.

# In[11]:

from pyspark.ml.feature import StandardScaler
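A minimal sketch of the standardization step described above, assuming the lf DataFrame with its unscaledFeatures vector column:

scaler = StandardScaler(inputCol='unscaledFeatures', outputCol='features',
                        withMean=True, withStd=True)
# fit computes the per-feature mean/std, transform rescales every row
lf = scaler.fit(lf).transform(lf)
lf.select('features', 'label').show(3)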
#Read "Subscriptions" data
subscriptions = spark.read.format("csv").option("header","true").option("inferSchema","true").load(path + "BDT2_1920_Subscriptions.csv")

# COMMAND ----------

#Subsetting Data for Train and Validation

last_date = '2018-08-31'
#Complaints
complaints = complaints.withColumn("ComplaintDate", to_date(col("ComplaintDate"), "yyyy-MM-dd")).filter(col("ComplaintDate")<=lit(last_date))
complaints = complaints.orderBy(desc('ComplaintDate'))
complaints.show(5)
#Delivery
delivery = delivery.withColumn("DeliveryDate", to_date(col("DeliveryDate"), "yyyy-MM-dd")).filter(col("DeliveryDate")<=lit(last_date))
delivery = delivery.orderBy(desc('DeliveryDate'))
delivery.show(5)
#Subscriptions
subscriptions = subscriptions.withColumn("EndDate", to_date(col("EndDate"), "yyyy-MM-dd")).filter(col("EndDate")<=lit(last_date))
subscriptions = subscriptions.orderBy(desc('EndDate'))
subscriptions.show(5)

# COMMAND ----------

#COMPLAINTS
#Check schema and first rows
complaints.printSchema() #Schema is ok
complaints.toPandas().head(5)

#Changing column types from string to integer
convert_int = ["ProductID","ComplaintTypeID","SolutionTypeID", "FeedbackTypeID"]
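The excerpt cuts off after listing the columns; a minimal sketch of the intended conversion, assuming the complaints DataFrame and the pyspark.sql.functions.col import used above:

for c in convert_int:
    complaints = complaints.withColumn(c, col(c).cast("integer"))
complaints.printSchema()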
Example #10
# read data from csv files to dataframe
df2 = spark.read.csv('E:/kaggle/titanic/train_kaggle.csv', header=True)
df2.count()
df2.describe().show()
# ---------------------------------------

df2.printSchema()
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked')
df3.show()
df3.printSchema()
# -------------------------------------------
from pyspark.ml.feature import StringIndexer, OneHotEncoder
df3 = StringIndexer(inputCol='Embarked',
                    outputCol='Embarked1').fit(df3).transform(df3)
df3.show()

df3 = OneHotEncoder(inputCol='Embarked1',
                    outputCol='Embarked2',
                    dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)

df3.groupBy(df3.Embarked, 'Embarked').agg({
    'Embarked': 'count',
    'Embarked1': 'sum'
}).show()
df3.show(5)
Example #11
dataset.show(5)

dataset.select([count(when(isnull(c), c)).alias(c)
                for c in dataset.columns]).show()

dataset = dataset.replace('?', None).dropna(how='any')

dataset = StringIndexer(inputCol='Sex',
                        outputCol='Gender',
                        handleInvalid='keep').fit(dataset).transform(dataset)

dataset = StringIndexer(inputCol='Embarked',
                        outputCol='Boarded',
                        handleInvalid='keep').fit(dataset).transform(dataset)

dataset.show()

dataset = dataset.drop('Sex')
dataset = dataset.drop('Embarked')

# Assemble all the features with VectorAssembler
required_features = ['Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

assembler = VectorAssembler(inputCols=required_features, outputCol='features')
transformed_data = assembler.transform(dataset)

(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])

rf = RandomForestClassifier(labelCol='Survived',
                            featuresCol='features',
                            maxDepth=5)
Example #12
# In[87]:


orgidxer_df = StringIndexer(inputCol='origin',
                            outputCol='origin_idx').fit(indexer_df).transform(indexer_df)


# In[90]:


desidxer_df = StringIndexer(inputCol='dest',
                            outputCol='dest_idx').fit(orgidxer_df).transform(orgidxer_df)


# In[91]:


desidxer_df.show(5)


# In[94]:


df2 = desidxer_df.drop('carrier', 'origin', 'dest')


# In[95]:


df2.show(5)


# In[93]:
Example #13
bucketizer.setHandleInvalid("skip").transform(df).show()

discretizers = [ft.QuantileDiscretizer(inputCol=c, outputCol="{}_buckets".format(c))
                for c in numeric_features]
# discretizers = ft.QuantileDiscretizer(numBuckets=3,inputCols=numeric_features, outputCols=["{}_buckets".format(c) for c in numeric_features])
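The per-column discretizers are built here but not applied in the excerpt; a minimal sketch of running them as pipeline stages, assuming the df and numeric_features used above:

from pyspark.ml import Pipeline

bucketed = Pipeline(stages=discretizers).fit(df).transform(df)
bucketed.select(["{}_buckets".format(c) for c in numeric_features]).show(5)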


# StringIndexer
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

# labelConveter
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer

df = spark.createDataFrame(
    [(0, "Yes"), (1, "Yes"), (2, "Yes"), (3, "No"), (4, "No"), (5, "No")],
    ["id", "label"])

indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
converter = IndexToString(inputCol="labelIndex", outputCol="originalLabel")
pipeline = Pipeline(stages=[indexer, converter])
model = pipeline.fit(df)
result = model.transform(df)
result.show()
Example #14
df1.count()
df1.printSchema()
df1.describe().show()
df1.groupby('type').count().show()
df1.groupby('color').count().sort('count', ascending=False).show()
# --------------------------------------------------------------------------
# cast to double
df2 = df1.select(df1.id.cast('double'), df1.bone_length.cast('double'),
                 df1.rotting_flesh.cast('double'),
                 df1.hair_length.cast('double'), df1.has_soul.cast('double'),
                 'color', 'type')

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
df3 = StringIndexer(inputCol='color',
                    outputCol='color1').fit(df2).transform(df2)
df3.show()
df3.printSchema()
df3 = OneHotEncoder(inputCol='color1', outputCol='color2',
                    dropLast=False).transform(df3)
df3.printSchema()
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=[
    'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'
],
                      outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()
Example #15
# Show Schema details of the spark Dataframe
df2.printSchema()

# creating new spark data frame based on the selected columns
df3 = df2.select('Pclass', 'Survived', 'Sex', 'Embarked')
df3.show(n=5)
df3.printSchema()

# Aggregation on a specific column
df3.groupBy('Embarked').agg({'Embarked': 'count'}).show()

# show summary of columns
df3.describe().show()

# Applying OneHotEncoder on the Embarked column

# Step 1: Convert the column to a set of numbers using StringIndexer. Here each category is assigned a number based on its frequency (the most frequent category gets index 0)
df3 = StringIndexer(inputCol='Embarked',
                    outputCol='EntryPoints').fit(df3).transform(df3)
df3.show(n=15)

# Step 2: Apply OneHot on Newly created StringIndexer column
df3 = OneHotEncoder(inputCol='EntryPoints',
                    outputCol='EntryLocations',
                    dropLast=False).transform(df3)
df3.show(15)

# combine all feature columns into one column to pass to the model
df3 = VectorAssembler(inputCols=[''])
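The inputCols list above is left as a placeholder; a minimal sketch of one plausible completion, assuming the columns created earlier in this example (Pclass needs a numeric cast first):

df3 = df3.withColumn('Pclass', df3.Pclass.cast('double'))
df3 = VectorAssembler(inputCols=['Pclass', 'EntryLocations'],
                      outputCol='Features').transform(df3)
df3.select('Features', 'Survived').show(5)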
Example #16
      StructField("C14", DoubleType(), False),
      StructField("C15", DoubleType(), False),
      StructField("C16", DoubleType(), False),
      StructField("C17", DoubleType(), False),
      StructField("C18", DoubleType(), False),
      StructField("C19", DoubleType(), False),
      StructField("C20", DoubleType(), False),
      StructField("C21", DoubleType(), False)])
# Get file
df = sqlContext.read.format("com.databricks.spark.csv").options(header= 'true').schema(customSchema).load("file:///home/bigdatas16/Downloads/train100K.csv")
# Displays the content of the DataFrame to stdout
df.show()

from pyspark.ml.feature import StringIndexer
data = StringIndexer(inputCol="click", outputCol="label").fit(df).transform(df)
data.show()

# RFormula
from pyspark.ml.feature import RFormula
formula = RFormula(formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_model + C14 + C17 + C18 + C19 + C21 ", featuresCol="features", labelCol="label")
output = formula.fit(data).transform(data)
data1 = output.select("label", "features")
data1.show()

# Split training and test data.
# (training, test) = data1.randomSplit([0.7, 0.3], seed=12345)
training, test = data1.randomSplit([0.7, 0.3], seed=12345)
training.show()

# Configure the classifier stage of the ML pipeline.
from pyspark.ml.classification import LogisticRegression
      StructField("C14", DoubleType(), True),
      StructField("C15", DoubleType(), True),
      StructField("C16", DoubleType(), True),
      StructField("C17", DoubleType(), True),
      StructField("C18", DoubleType(), True),
      StructField("C19", DoubleType(), True),
      StructField("C20", DoubleType(), True),
      StructField("C21", DoubleType(), True)
    ])
    

from pyspark.ml.feature import StringIndexer
## Index labels, adding metadata to the label column.
## Fit on whole dataset to include all labels in index.
data = StringIndexer(inputCol="click", outputCol="label").fit(data).transform(data)
data.show()
## the result of .transform(data) can be assigned to a new DataFrame; it does not have to overwrite data
#labelIndexer  ===> data


# RFormula
from pyspark.ml.feature import RFormula
## RFormula: string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
## the feature set to use is specified in the formula string below
formula = RFormula(
    formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + device_type + device_conn_type",
    #formula="label ~ banner_pos + app_id + site_category + site_id + site_domain + C14 + C17 + C18 + C19 + C21", #0.707636
    #formula="label ~ banner_pos + site_id + site_domain + C14 + C17 + C21", #0.7
    featuresCol="features",
    labelCol="label")
formula_data = formula.fit(data).transform(data)
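The fragment ends after the RFormula step; a minimal sketch of training and evaluating a LogisticRegression model on formula_data, reusing the 70/30 split and seed from the earlier fragment:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

training, test = formula_data.randomSplit([0.7, 0.3], seed=12345)
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
lr_model = lr.fit(training)

predictions = lr_model.transform(test)
auc = BinaryClassificationEvaluator(labelCol="label").evaluate(predictions)
print("AUC =", auc)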
Example #18
flights = flights.dropna()
print("\nThe data contains %d records after dropping records with na values." % flights.count())

# Create an indexer for carrier categorical feature
indexer = StringIndexer(inputCol="carrier", outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the org categorical feature
flights_indexed = StringIndexer(inputCol="org", outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
# Check first five records
flights_indexed.show(5)

flites = flights_indexed.select('carrier', 'org', 'org_idx')

# Create an instance of the one hot encoder
# (OneHotEncoderEstimator was renamed to OneHotEncoder in Spark 3.0)
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flites)
flights_onehot = onehot.transform(flites)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

spark.stop()
Example #19
df2 = spark.read.csv('/users/jyothsnap/Kaggle/titanic/train.csv',header=True)
df2.count()

# ---------------------------------------

df3 = df2.select('Sex','Pclass','Survived','Embarked')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer
df3 = StringIndexer(inputCol='Sex',outputCol='Gender').fit(df3).transform(df3)
df3.groupby(df3.Embarked,'Embarked').agg({'Embarked':'count'}).show()
df3 = StringIndexer(inputCol='Embarked',
                    outputCol='Embarked_Transformed').fit(df3).transform(df3)
#df3.groupby(df3.Embarked,'Embarked').agg({'Embarked':'count'}).show()
df3.show()
df3.printSchema()

df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler

from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'],
                      outputCol='Features').transform(df3)

df3.show()
#
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
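The excerpt stops at the import; a minimal sketch of fitting the DecisionTreeClassifier on the assembled data, assuming the Features and Survived columns built above:

train, test = df3.randomSplit([0.8, 0.2], seed=42)
dt = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', maxDepth=5)
dt_model = dt.fit(train)
dt_model.transform(test).select('Survived', 'prediction', 'probability').show(5)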