Example #1
# Note: OneHotEncoderEstimator is the Spark 2.3/2.4 name; in Spark 3.x it is OneHotEncoder.
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator


def indexer(dataset):
    dataset = StringIndexer(
        inputCol='Sex',
        outputCol='Gender',
        handleInvalid='keep').fit(dataset).transform(dataset)
    dataset = StringIndexer(
        inputCol='Embarked',
        outputCol='Boarded',
        handleInvalid='keep').fit(dataset).transform(dataset)
    dataset = dataset.drop('Sex', 'Embarked')
    return dataset
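
# Usage sketch (not part of the original snippet; the sample rows are hypothetical).
# handleInvalid='keep' reserves an extra index for labels unseen at fit() time.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([('male', 'S'), ('female', 'C'), ('male', 'Q')],
                               ['Sex', 'Embarked'])
indexer(sample).show()  # numeric 'Gender' and 'Boarded' replace 'Sex' and 'Embarked'
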
def encode_using_one_hot(df, column_name):
    '''
    Transforms df at column_name by one-hot encoding each unique category into a vector.
    e.g. if the column values are {a, b, b, c, d}, they become vectors such as
    <1.0, 0, 0>, <0, 1.0, 0>, ... (good for non-ordinal categories)
    '''
    indexed_name = 'index_'+column_name
    vectored_name = 'vec_'+column_name

    df = StringIndexer(inputCol=column_name, outputCol=indexed_name,
                       handleInvalid="skip").fit(df).transform(df)

    encoder = OneHotEncoderEstimator(
        inputCols=[indexed_name], outputCols=[vectored_name])
    model = encoder.fit(df)
    df = model.transform(df)
    df = df.drop(indexed_name)
    df = df.drop(column_name)
    df = df.withColumnRenamed(vectored_name, column_name)
    return df
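
# Usage sketch (hypothetical data; assumes Spark 2.3/2.4, where OneHotEncoderEstimator
# exists, and reuses the `spark` session created above). With default settings the last
# category is dropped, so {a, b, b, c} becomes sparse vectors such as (2,[0],[1.0]).
cats = spark.createDataFrame([('a',), ('b',), ('b',), ('c',)], ['colour'])
encode_using_one_hot(cats, 'colour').show(truncate=False)
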
def encode_using_indexer(df, column_name):
    '''
    Transforms df at column_name by replacing each unique category with a numeric index.
    e.g. if the column values are {a, b, b, c, d}, they become {0.0, 1.0, 1.0, 2.0, 3.0}
    (good for binary categories)
    '''
    indexed_name = 'index_'+column_name
    df = StringIndexer(inputCol=column_name, outputCol=indexed_name,
                       handleInvalid="skip").fit(df).transform(df)
    df = df.drop(column_name)
    df = df.withColumnRenamed(indexed_name, column_name)
    return df
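
# Usage sketch (hypothetical data, reusing the `spark` session from the first sketch):
# the most frequent label gets index 0.0, the next 1.0, and so on.
flags = spark.createDataFrame([('yes',), ('no',), ('no',)], ['answer'])
encode_using_indexer(flags, 'answer').show()  # 'answer' becomes 0.0 / 1.0
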
Example #4
# Imports needed by this snippet; spark_initiate(), model_structure(), load_data(),
# transformer() and estimators() are assumed to be project-specific helpers.
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics


def main(config):
    # Cookie-cutter sequence of the steps needed to train and evaluate a model,
    # following the general pipeline outlined in Spark's MLlib docs:
    # https://spark.apache.org/docs/latest/ml-pipeline.html

    spark = spark_initiate()

    # load the raw data and apply the transformer
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()

    df, cat_dict = transformer(data)
    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')
    list_str = [] # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature, 
                               outputCol=feature + '_index'
                               ) \
                 .fit(df) \
                 .transform(df)
    df = df.drop(*list_str)
    df.show()
    features = list(set(df.columns) - set(config['base']['labelCol'].split(',')))
    assembler = VectorAssembler(inputCols=features,
                                outputCol='features')
    df = assembler.transform(df)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator

    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)
    # BinaryClassificationMetrics expects (score, label) pairs, where the score is the
    # probability of the positive class (index 1 of the probability vector)
    predictionAndLabels = testData.select('probability', 'Survived') \
                                  .rdd.map(lambda x: (float(x[0][1]),
                                                      float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)

    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
Example #5
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer


def load_csv(sc, filename='200[0-5].csv'):
    sql_context = SQLContext(sc)
    df = sql_context.read.option('mode', 'PERMISSIVE')\
                    .load(filename,
                          format='com.databricks.spark.csv',
                          header='true',
                          nullValue='NA',
                          inferSchema='true').cache()
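    # FEATURE_USED is assumed to be a module-level list of the column names to keep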
    df = df[FEATURE_USED]
    df = df.na.drop()
    # turn string to index
    for col in ['UniqueCarrier', 'Origin', 'Dest']:
        df = StringIndexer(inputCol=col,
                           outputCol=col + '_value').fit(df).transform(df)
        df = df.drop(col)

    # reordering
    df = df.select([
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier_value', 'FlightNum', 'CRSElapsedTime', 'Origin_value',
        'Dest_value', 'Distance', 'Cancelled'
    ])
    return df
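
# Usage sketch (not from the original snippet): assumes FEATURE_USED is defined and the
# CSV files exist; the default filename glob matches 2000.csv through 2005.csv.
#   sc = SparkContext.getOrCreate()
#   flights = load_csv(sc, filename='2004.csv')
#   flights.show(5)
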
dataset = StringIndexer(inputCol='Sex',
                        outputCol='Gender',
                        handleInvalid='keep').fit(dataset).transform(dataset)

dataset = StringIndexer(inputCol='Embarked',
                        outputCol='Boarded',
                        handleInvalid='keep').fit(dataset).transform(dataset)

dataset.toPandas().head()

# #### Drop the redundant columns

# In[6]:

dataset = dataset.drop('Sex')
dataset = dataset.drop('Embarked')

dataset.toPandas().head()

# #### Define the required features to use in the VectorAssembler
# Since we are only examining data and not making predictions, we include all columns

# In[7]:

requiredFeatures = ['Survived', 'Pclass', 'Age', 'Fare', 'Gender', 'Boarded']

# #### The VectorAssembler vectorises all the features
# The transformed data will be used for clustering

# In[8]:
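
# Sketch of the assembler step described above (the original cell is cut off here);
# the VectorAssembler import is shown for completeness, and the columns listed in
# requiredFeatures are assumed to be numeric:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=requiredFeatures, outputCol='features')
transformed_data = assembler.transform(dataset)
transformed_data.toPandas().head()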
Example #7
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

spark = SparkSession.builder.getOrCreate()
df2 = spark.read.csv(
    'C:/Manidhar/MachineLearningLab/datasets/titanic/train.csv', header=True)
df2.show()
df2.describe().show()
df2.printSchema()
df3 = df2.select('Pclass', 'Sex', 'SibSp', 'Fare', 'Survived')
df3.printSchema()
df3.show()
df4 = df2.filter(df2.Age > 40).select('Pclass', 'SibSp', 'Survived')
df4.show()
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = df3.drop('Sex')
df3.show()

df4.count()
df5 = df2.filter(df2.Age > 20).select('Pclass', 'SibSp', 'Survived')
df5.count()
df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.printSchema()
df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'],
                      outputCol='Features').transform(df3)
df3.show()

dt1 = DecisionTreeClassifier(featuresCol='Features',
                             labelCol='Survived',
                             maxDepth=10)
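
# Sketch of the remaining steps (the original snippet is cut off above); fitting on df3
# directly, without a train/test split, purely to illustrate the API:
dt_model = dt1.fit(df3)
dt_model.transform(df3).select('Features', 'Survived', 'prediction').show()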
Example #8
# In[90]:


desidxer_df = StringIndexer(inputCol='dest',
                            outputCol='dest_idx').fit(orgidxer_df).transform(orgidxer_df)


# In[91]:


desidxer_df.show(5)


# In[94]:


df2 = desidxer_df.drop('carrier', 'origin', 'dest')


# In[95]:


df2.show(5)


# In[101]:
# Imports needed by this snippet (init_spark() is assumed to be a project-specific
# helper that returns a SparkSession and a SparkContext).
from pyspark.sql.functions import col, count, when, isnull
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def main():
    spark, sc = init_spark()

    df = (spark.read.format("csv").option(
        'header', 'true').load("C:\\sparkTmp\\titanic_train.csv"))

    df.show(5)

    # How many rows we have
    print(df.count())

    # The names of our columns
    print(df.columns)

    # Types of our columns
    print(df.dtypes)

    df.describe().show()

    dataset = df.select(
        col("Survived").cast("float"),
        col("Pclass").cast("float"),
        col("Sex"),
        col("Age").cast("float"),
        col("Fare").cast("float"),
        col("Embarked"),
    )

    dataset.show()

    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()
    dataset = dataset.dropna(how="any")
    dataset.select(
        [count(when(isnull(c), c)).alias(c) for c in dataset.columns]).show()

    # We need to transform Sex and Embarked into numerical values
    dataset = StringIndexer(
        inputCol="Sex", outputCol="Gender",
        handleInvalid="keep").fit(dataset).transform(dataset)

    dataset = StringIndexer(
        inputCol="Embarked", outputCol="Boarded",
        handleInvalid="keep").fit(dataset).transform(dataset)

    # StringIndexer does not just produce a plain double column; it also records the
    # original category labels in the column metadata (shown here for 'Boarded')
    print(dataset.schema.fields[7].metadata)

    dataset = dataset.drop("Sex")
    dataset = dataset.drop("Embarked")

    dataset.show()

    required_features = ["Pclass", "Age", "Fare", "Gender", "Boarded"]

    assembler = VectorAssembler(inputCols=required_features,
                                outputCol='features')

    transformed_data = assembler.transform(dataset)
    transformed_data.show()

    (training_data, test_data) = transformed_data.randomSplit([0.8, 0.2])
    rf = RandomForestClassifier(labelCol="Survived",
                                featuresCol="features",
                                maxDepth=5)
    model = rf.fit(training_data)
    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="Survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = ", accuracy)