Code Example #1
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics


def main(config):
    # Boilerplate sequence of steps needed to train and evaluate a model,
    # following the general pipeline outlined in Spark's MLlib docs:
    # https://spark.apache.org/docs/latest/ml-pipeline.html

    # spark_initiate, load_data, model_structure, transformer and estimators
    # are project-level helpers defined elsewhere (not shown here)
    spark = spark_initiate()

    # Load the data and apply the transformer stage
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()

    df, cat_dict = transformer(data)
    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')
    list_str = [] # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature, 
                               outputCol=feature + '_index'
                               ) \
                 .fit(df) \
                 .transform(df)
    df = df.drop(*list_str)
    df.show()
    # Use every remaining column except the label as a feature
    features = list(set(df.columns) - {config['base']['labelCol']})
    assembler = VectorAssembler(inputCols=features,
                                outputCol='features')
    df = assembler.transform(df)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator

    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)
    # Pair each score (probability of the first class) with the true label,
    # as expected by the RDD-based BinaryClassificationMetrics API
    predictionAndLabels = testData.select('probability', 'Survived') \
                                  .rdd.map(lambda x: (float(x[0][0]),
                                                      float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)

    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
Code Example #2
import time

from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Assumes an active SparkSession 'spark'; tranform_data holds the
# (label, sentence) rows prepared upstream (not shown)
sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF features
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer(inputCol="label", outputCol="indexed") \
    .fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
print(end_time - start_time)

predictionsClassifier = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator().setLabelCol(
    "indexed").setPredictionCol("prediction")
print(
    "accuracy = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "accuracy"}))
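The same tokenize → TF-IDF → index → Naive Bayes flow can also be expressed as a single pyspark.ml Pipeline, which keeps all fitted stages together instead of applying each transformer by hand. A minimal sketch under the same column names as above:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    Tokenizer(inputCol="sentence", outputCol="words"),
    HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000),
    IDF(inputCol="rawFeatures", outputCol="features"),
    StringIndexer(inputCol="label", outputCol="indexed"),
    NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed"),
])

# Split the raw sentences and let the pipeline handle every transformation
rawTrain, rawTest = sentenceData.randomSplit([0.8, 0.2], seed=0)
pipelineModel = pipeline.fit(rawTrain)
pipelineModel.transform(rawTest).select("indexed", "prediction").show(5)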
Code Example #3
    inputCol="Survived",
    outputCol="indexedSurvived").fit(dataTitanic).transform(dataTitanic)

# One-hot encode the indexed categorical features (OneHotEncoder is used as a
# plain transformer here, i.e. the Spark 2.x API; in Spark 3+ it is an
# estimator and must be fit() first)
dataTitanic = OneHotEncoder(inputCol="indexedSex",
                            outputCol="sexVec").transform(dataTitanic)
dataTitanic = OneHotEncoder(inputCol="indexedEmbarked",
                            outputCol="embarkedVec").transform(dataTitanic)

# Feature assembler as a vector
dataTitanic = VectorAssembler(
    inputCols=["Pclass", "sexVec", "embarkedVec", "Age", "SibSp", "Fare"],
    outputCol="features").transform(dataTitanic)

# Splitting into train and test sets. Beware: it sorts the dataset
(trainDF, testDF) = dataTitanic.randomSplit([0.7, 0.3], seed=42)

rf = RandomForestClassifier(labelCol="indexedSurvived", featuresCol="features")

time_start = time.time()
model_rf = rf.fit(trainDF)

time_end = time.time()
time_rf = (time_end - time_start)
print("RF takes %d s" % (time_rf))

predictions = model_rf.transform(testDF)

# Select example rows to display.
predictions.select(
    col("prediction"),
    col("indexedSurvived"),
    col("probability")).show(5)
Code Example #4
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression

coviddeath = spark.sql("SELECT * FROM uscasestemp1_csv")

# COMMAND ----------

data = coviddeath.select("Year", "Date", "Day", "Temp", "Lat", "Long",
                         "Admin2", "Province",
                         ((col("Case") > 2).cast("Double").alias("label")))
data = StringIndexer(inputCol='Admin2',
                     outputCol='Admin2' + "_index").fit(data).transform(data)
data = StringIndexer(inputCol='Province',
                     outputCol='Province' + "_index").fit(data).transform(data)
data.show(5)

# COMMAND ----------

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
assembler = VectorAssembler(
    inputCols=["Day", "Temp", "Lat", "Province_index", "Admin2_index"],
    outputCol="normfeatures")
minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures")
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")
# featuresCol is assumed to be the assembled "features" vector produced by
# featVect above
lr = LogisticRegression(labelCol="label", featuresCol="features")
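A plausible continuation of the cell above chains the three feature stages and the logistic regression into a Pipeline and scores the held-out rows. This is a sketch under the column names defined above, not the notebook's original code:

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

pipeline = Pipeline(stages=[assembler, minMax, featVect, lr])
model = pipeline.fit(train)

# The test split had its label renamed to "trueLabel" above
predicted = model.transform(test)
auc = BinaryClassificationEvaluator(
    labelCol="trueLabel", rawPredictionCol="rawPrediction",
    metricName="areaUnderROC").evaluate(predicted)
print("Test AUC:", auc)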
Code Example #5
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# flights is loaded earlier (not shown); check the first five records
flights.show(5)

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"
],
                            outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Keep only the assembled features and the label column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier(labelCol="xdelay")
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(flights_test)
predictions = prediction.select('xdelay', 'prediction', 'probability')

print(predictions.toPandas().sample(12))
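
# (Illustrative addition, not part of the original exercise.) The fitted tree
# itself can be inspected: featureImportances weights each assembled input
# column and toDebugString prints the learned split rules.
print(tree_model.featureImportances)
print(tree_model.toDebugString)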

spark.stop()
Code Example #6
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Keep only the columns used below; the upstream preparation of df is not
# shown, so this select is an assumed reconstruction of the truncated call
df = df.select('follow', 'pr_score_scr', 'pr_score_dst', 'jaccard',
               'user_sim', 'hash_sim')
train_col = [
    'follow', 'pr_score_scr', 'pr_score_dst', 'jaccard', 'user_sim', 'hash_sim'
]
for i in train_col:
    df = df.withColumn(i, df[i].cast('float'))

assembler = VectorAssembler(inputCols=[
    'pr_score_scr', 'pr_score_dst', 'jaccard', 'user_sim', 'hash_sim'
],
                            outputCol="features")
df = assembler.transform(df)
df = StringIndexer(inputCol="follow",
                   outputCol="label").fit(df).transform(df).select(
                       'features', 'label')

print('Split train and test dataset...')
train_df, test_df = df.randomSplit([0.7, 0.3], seed=0)

print('Train RandomForest model...')
rf = RandomForestClassifier(numTrees=10, maxDepth=5, labelCol="label", seed=0)
model = rf.fit(train_df)

print('Evaluation...')
prediction = model.transform(test_df).select('label', 'probability',
                                             'prediction')
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability")
print('Test Area Under ROC', evaluator.evaluate(prediction))
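
# (Illustrative sketch, not in the original script.) Instead of fixing
# numTrees=10 and maxDepth=5, a CrossValidator can search a small grid with
# the same evaluator:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [10, 20, 50])
        .addGrid(rf.maxDepth, [5, 10])
        .build())
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3, seed=0)
cvModel = cv.fit(train_df)
print('Tuned model AUC:', evaluator.evaluate(cvModel.transform(test_df)))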

sc.stop()
Code Example #7
from pyspark.ml.feature import VectorAssembler

# Create an assembler object (the leading column names are assumed to match
# the flights example above, since the list is cut off here)
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights_assembled.select('features', 'delay').show(5, truncate=False)

# Split train/test
# Specify a seed for reproducibility
cars_train, cars_test = cars.randomSplit([0.8, 0.2], seed=23)
print([cars_train.count(), cars_test.count()])

# Build a Decision Tree model
from pyspark.ml.classification import DecisionTreeClassifier
# Create a Decision Tree classifier
tree = DecisionTreeClassifier()
# Learn from the training data
tree_model = tree.fit(cars_train)
# Evaluating
prediction = tree_model.transform(cars_test)
# Confusion matrix: a table describes performance of a model on testing data
prediction.groupBy('label', 'prediction').count().show()
# Accuracy = (TN + TP) / (TN + TP + FN + FP) - proportion of correct predictions
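
# (Illustrative addition.) The accuracy can be computed directly from the
# confusion-matrix counts: correct predictions divided by all predictions.
correct = prediction.filter('label = prediction').count()
total = prediction.count()
print('Accuracy = %.3f' % (correct / total))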

Code Example #8
File: main.py  Project: jwzcheng/survey
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# One-hot encode every categorical column (the loop header is reconstructed;
# encoder_input_col and the indexed input columns are prepared in code that
# is not shown here)
for i in encoder_input_col:
    data = OneHotEncoder(dropLast=False, inputCol=i,
                         outputCol=i + "_Vec").transform(data)
    test = OneHotEncoder(dropLast=False, inputCol=i,
                         outputCol=i + "_Vec").transform(test)

assembler_input_col = []
for i in encoder_input_col:
    assembler_input_col.append(i + '_Vec')

# # Assemble all features into 'features'
data = VectorAssembler(inputCols=assembler_input_col,
                       outputCol='features').transform(data)
test = VectorAssembler(inputCols=assembler_input_col,
                       outputCol='features').transform(test)

# # Split the data into training and test sets (30% held out for testing)
(data_train, data_test) = data.randomSplit([0.7, 0.3])

print('==TRAINING MODEL== \n')

rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="features",
                            numTrees=10)

model = rf.fit(data_train)

# Score the separately supplied test frame
predictions = model.transform(test)

# Select example rows to display.
predictions.select(["prediction", "probability"]).show(5)
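A fitted RandomForestClassificationModel can also be persisted and reloaded later. A minimal sketch; the output path is chosen here purely for illustration:

from pyspark.ml.classification import RandomForestClassificationModel

# Hypothetical output path; adjust to the project's storage layout
model_path = "models/survey_rf"
model.write().overwrite().save(model_path)

# Reload in a later session and score new rows the same way
reloaded = RandomForestClassificationModel.load(model_path)
reloaded.transform(test).select("prediction", "probability").show(5)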