Example #1
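The snippets on this page omit their import statements. A plausible set of the PySpark imports they rely on is sketched below; RF is used throughout as an alias for RandomForestClassifier, and sc, sqlContext, and spark are assumed to be an existing SparkContext, SQLContext, and SparkSession.

from pyspark.ml import Pipeline
from pyspark.ml.feature import (StringIndexer, VectorAssembler, OneHotEncoder,
                                IndexToString)
from pyspark.ml.classification import (RandomForestClassifier as RF,
                                       RandomForestClassificationModel,
                                       DecisionTreeClassifier)
from pyspark.ml.evaluation import (MulticlassClassificationEvaluator,
                                   BinaryClassificationEvaluator)
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import IntegerType, StringType
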
def LearningCurve(df, target):

    df_t = df
    # Collect the categorical (string) columns, excluding the target
    string_cols = []
    for (col_name, col_type) in df.dtypes:
        if col_type == 'string' and col_name != target:
            string_cols.append(col_name)

    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]

    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    pipeline = Pipeline(stages=indexers)
    df_t = pipeline.fit(df_t).transform(df_t)

    cols_now = num_cols + encoded_cols
    assembler_features = VectorAssembler(inputCols=cols_now,
                                         outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    tmp = [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()
    trainingData, testData = df_t.randomSplit([0.7, 0.3], seed=0)

    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    plot_points = []

    # Step size: the percentage of training data added at each iteration
    step_var = 10

    for i in range(step_var, 101, step_var):

        # takeSample expects an integer count, so use integer division
        sample_size = (i * trainingData.count()) // 100
        part_Data = trainingData.rdd.takeSample(False, sample_size, seed=i)
        part_Data = sqlContext.createDataFrame(part_Data)

        model = rf.fit(part_Data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction",
            metricName="accuracy")

        #Calculating train error
        transformed = model.transform(part_Data)
        train_accuracy = evaluator.evaluate(transformed)
        train_error = 1 - train_accuracy

        #Calculating test error
        transformed = model.transform(testData)
        test_accuracy = evaluator.evaluate(transformed)
        test_error = 1 - test_accuracy

        plot_points.append([i, train_error, test_error])

    return plot_points
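
A minimal usage sketch, assuming a Spark DataFrame df with a string-typed target column named "target" and matplotlib available on the driver:

import matplotlib.pyplot as plt

# Hypothetical usage: plot train/test error versus the training-data percentage
points = LearningCurve(df, "target")
pct, train_err, test_err = zip(*points)
plt.plot(pct, train_err, label="train error")
plt.plot(pct, test_err, label="test error")
plt.xlabel("% of training data used")
plt.ylabel("error")
plt.legend()
plt.show()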
Example #2
def rf(df):
    # df is expected to already contain assembled 'features' and indexed 'label' columns
    trainingData, testData = df.randomSplit([0.7, 0.3], seed=0)
    rf = RF(labelCol='label', featuresCol='features', numTrees=100)
    fit = rf.fit(trainingData)
    # featureImp = fit.featureImportances
    fit.save("s3a://ffinsight/model_rf")
    prediction = fit.transform(testData)
    return prediction
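
Because the model is persisted with fit.save(...), it can be reloaded later without retraining; a minimal sketch using the matching model class (the testData name is assumed to be a DataFrame assembled the same way):

from pyspark.ml.classification import RandomForestClassificationModel

# Reload the persisted model from the same S3 path and reuse it for scoring
loaded_model = RandomForestClassificationModel.load("s3a://ffinsight/model_rf")
new_predictions = loaded_model.transform(testData)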
Example #3
def FeatureImportances(df, target, FeatureImportance_cutoff):

    df_t = df
    # Collect the categorical (string) columns, excluding the target
    string_cols = []
    for (col_name, col_type) in df.dtypes:
        if col_type == 'string' and col_name != target:
            string_cols.append(col_name)

    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]

    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    pipeline = Pipeline(stages=indexers)
    df_t = pipeline.fit(df_t).transform(df_t)

    cols_now = num_cols + encoded_cols
    assembler_features = VectorAssembler(inputCols=cols_now,
                                         outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    tmp = [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()
    trainingData, testData = df_t.randomSplit([0.8, 0.2], seed=0)

    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    model = rf.fit(trainingData)
    feat_imp = dict()
    vi = model.featureImportances
    no_of_cols = len(cols_now)
    # encoded_cols were built from string_cols in the same order, so the
    # importances can be mapped back to the original column names
    cols_actual = num_cols + string_cols

    for i in range(no_of_cols):
        feat_imp[cols_actual[i]] = vi[i]

    return feat_imp
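
Note that FeatureImportance_cutoff is not applied inside the function in this snippet; a hypothetical follow-up that filters the returned dictionary by the cutoff (df and "target" are assumed):

# Hypothetical usage: keep only features whose importance meets the cutoff
cutoff = 0.01                                         # illustrative value
feat_imp = FeatureImportances(df, "target", cutoff)
selected_features = {c: imp for c, imp in feat_imp.items() if imp >= cutoff}
print(sorted(selected_features.items(), key=lambda kv: -kv[1]))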
def transform(df):
    # Use every column except the label ('Severity') as a feature
    cols = df.columns
    cols.remove('Severity')
    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    df_transformed = vecAssembler.transform(df)
    return df_transformed


def evaluate_model(df):
    # Without metricName the evaluator defaults to F1, so request accuracy explicitly
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol='Severity',
                                                  metricName='accuracy')
    accuracy_rf = evaluator.evaluate(df)
    return accuracy_rf


preprocessed_df = preprocessing(accident_df)
indexed_df = indexing(preprocessed_df)
transformed_df = transform(indexed_df)
#Split data into Training and Testing
train, test = transformed_df.randomSplit([0.7, 0.3], seed=2000)
#Using Random Forest Algorithm
rf = RF(featuresCol='features',
        numTrees=12,
        maxDepth=16,
        labelCol="Severity",
        maxBins=150)
model_rf = rf.fit(train)
#Predicting on test data
prediction_rf = model_rf.transform(test)
accuracy = evaluate_model(prediction_rf)
print("Accuracy is ", accuracy)
feature_columns.remove('status')
# Note: toPandas() collects the full dataset onto the driver before writing the CSV
data.toPandas().to_csv("new_data.csv")
assembler_features = VectorAssembler(inputCols=feature_columns,
                                     outputCol='features')
prediction_column = StringIndexer(inputCol='status', outputCol='label')
tmp = [assembler_features, prediction_column]

pipeline = Pipeline(stages=tmp)
# Running total of areaUnderROC across the 10 runs
total_auc = 0.0

for i in range(10):
    all_data = pipeline.fit(data).transform(data)
    (training_data, test_data) = all_data.randomSplit([0.8, 0.2])
    # DataFrames are immutable, so reassign the result of drop()
    test_data = test_data.drop('status')

    rf = RF(labelCol='label', featuresCol='features', numTrees=200)

    fit = rf.fit(training_data)
    transformed = fit.transform(test_data)

    results = transformed.select(['probability', 'label'])

    results_collect = results.collect()
    # probability[0] is P(label == 0), paired here with an indicator for class 0
    result_list = [(float(row[0][0]), 1.0 - float(row[1]))
                   for row in results_collect]
    scores = sc.parallelize(result_list)

    # BinaryClassificationMetrics (pyspark.mllib.evaluation) stands in for the
    # undefined `meteric` helper referenced in the source
    metrics = BinaryClassificationMetrics(scores)
    total_auc = total_auc + metrics.areaUnderROC

print(total_auc / 10)
Example #6
assembler = VectorAssembler(inputCols=cols_out, outputCol="features")

labelIndexer = StringIndexer(inputCol="y", outputCol="label")

# tmp is assumed to already hold the earlier indexer/encoder stages
tmp += [assembler, labelIndexer]

pipeline = Pipeline(stages=tmp)

allData = pipeline.fit(trainDF).transform(trainDF)

allData.cache()

trainData, validData = allData.randomSplit([0.8, 0.2], seed=1)

randforest = RF(labelCol="label", featuresCol="features", numTrees=100)

rf_fit = randforest.fit(trainData)

transformed = rf_fit.transform(validData)

results = transformed.select(["probability", "label"])

results_collect = results.collect()

results_list = [(float(row[0][0]), 1.0 - float(row[1])) for row in results_collect]

score = sc.parallelize(results_list)

# `metric` is presumably an alias for pyspark.mllib.evaluation.BinaryClassificationMetrics
metrics = metric(score)
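
If only areaUnderROC is needed, the DataFrame-based evaluator avoids the collect/parallelize round trip; a minimal sketch, assuming the same `transformed` DataFrame with its rawPrediction and label columns:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# areaUnderROC computed directly on the transformed DataFrame
auc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol="label",
                                              metricName="areaUnderROC")
print(auc_evaluator.evaluate(transformed))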
def SuggestIntervention(subjectid):

    print("[v] -> PySpark SQL")
    SPARK_URL = "local[*]"
    spark = SparkSession.builder.appName("").master(SPARK_URL).getOrCreate()
    sc = spark.sparkContext
    # Import our data here
    # Columns named based on the query below
    sqlcontext = SQLContext(sc)
    columns = [
        'itvInterventieOptieId', 'itvRegieParentId', 'sjId', 'sjGender',
        'sjDateOfBirth', 'sjMaritalStatusId', 'sjWoonplaatsId', 'casId',
        'casClassification', 'casThemaGebiedId', 'lgscoreRegieParentId',
        'lgscoreScore', 'probProbleemOptieId', 'itvGoalReached',
        'itvGeresidiveerd'
    ]

    # Retrieve the data for the given subject/intervention (ITV) id
    database = igDatabaseModule.Database("PGA_HRO")
    subjectdata = database.get_data_from_the_database(
        'SELECT distinct intoptid, sjId ,sjGender,sjDateOfBirth,sjMaritalStatusId,'
        'sjWoonplaatsId,casId,casClassification,casThemaGebiedId,'
        'lgscoreRegieParentId,lgscoreScore '
        'FROM tblSubject, tblMeldingZSMPlus, tblCasus, tblLeefgebiedScore, '
        '(select * from tblInterventieOptie where intoptIsInactive = 0) as interventies '
        'where zsmSubjectId = sjid and zsmCasusId = casId '
        'and lgscoreRegieParentId = casId and casThemaGebiedId is not null '
        'and sjMaritalStatusId is not null and sjWoonplaatsId is not Null '
        'and sjid = {};'.format(subjectid))
    pandadf = pandas.DataFrame(subjectdata,
                               columns=[
                                   'itvInterventieOptieId', 'sjId', 'sjGender',
                                   'sjDateOfBirth', 'sjMaritalStatusId',
                                   'sjWoonplaatsId', 'casId',
                                   'casClassification', 'casThemaGebiedId',
                                   'lgscoreRegieParentId', 'lgscoreScore'
                               ])

    subjectDF = sqlcontext.createDataFrame(pandadf)

    # Note: this identity UDF is defined but never used; the columns below are added with lit()
    goalreacheudf = udf(lambda x: x, IntegerType())
    subjectDF = subjectDF.withColumn("itvGoalReached", lit(1))
    subjectDF = subjectDF.withColumn("itvGeresidiveerd", lit("Nee"))
    print("[v] -> parallelize")

    # Raw string so the backslash in the Windows path is not treated as an escape
    csvpath = r"c:\FinalCSVnoNull.csv"
    dataDF = spark.read.options(header="true", inferschema="true").csv(csvpath)
    print("[v] -> csv")
    dataDF = dataDF.select([
        'itvInterventieOptieId', 'sjId', 'sjGender', 'sjDateOfBirth',
        'sjMaritalStatusId', 'sjWoonplaatsId', 'casId', 'casClassification',
        'casThemaGebiedId', 'lgscoreRegieParentId', 'lgscoreScore',
        'itvGoalReached', 'itvGeresidiveerd'
    ])

    def calculate_age(born):
        born = datetime.datetime.strptime(born, "%Y-%m-%d %H:%M:%S").date()
        today = date.today()
        return today.year - born.year - ((today.month, today.day) <
                                         (born.month, born.day))

    def calculate_age_subject(born):
        born = datetime.datetime.strptime(born, "%Y-%m-%d").date()
        today = date.today()
        return today.year - born.year - ((today.month, today.day) <
                                         (born.month, born.day))

    calculate_age_udf = udf(calculate_age, IntegerType())
    calculate_age_udf_subject = udf(calculate_age_subject, IntegerType())

    dataDF = dataDF.withColumn(
        "sjAge", calculate_age_udf(dataDF.sjDateOfBirth.cast("string")))

    subjectDF = subjectDF.withColumn(
        "sjAge",
        calculate_age_udf_subject(subjectDF.sjDateOfBirth.cast("string")))
    print("[v] -> ageconvertion")

    df = dataDF.union(subjectDF)

    # One-hot encoding converts the non-integer columns into numeric vectors the model can distinguish
    column_vec_in = [
        'itvInterventieOptieId', 'sjGender', 'sjMaritalStatusId',
        'sjWoonplaatsId', 'casClassification', 'casThemaGebiedId',
        'lgscoreRegieParentId', 'lgscoreScore'
    ]
    column_vec_out = [
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec'
    ]
    indexers = [
        StringIndexer(inputCol=x, outputCol=x + '_tmp') for x in column_vec_in
    ]
    encoders = [
        OneHotEncoder(dropLast=False, inputCol=x + "_tmp", outputCol=y)
        for x, y in zip(column_vec_in, column_vec_out)
    ]
    # Interleave each StringIndexer with its OneHotEncoder, then flatten into a single stage list
    tmp = [[i, j] for i, j in zip(indexers, encoders)]
    tmp = [i for sublist in tmp for i in sublist]
    print("[v] -> onehotencoding")

    #finalize with pipeline
    cols_now = [
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached', 'sjAge'
    ]
    assembler_features = VectorAssembler(inputCols=cols_now,
                                         outputCol='parameters')
    labelIndexer = StringIndexer(inputCol='itvGeresidiveerd',
                                 outputCol="resultintervention")
    tmp += [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)

    allData = pipeline.fit(df).transform(df)
    allData = allData.select([
        'itvInterventieOptieId', 'sjId', 'itvInterventieOptieIdvec',
        'sjGendervec', 'sjMaritalStatusIdvec', 'sjWoonplaatsIdvec',
        'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])
    print("[v] -> pipeline")
    allData.cache()
    print(str(allData.count()))
    print("[v] -> trainingcashe")

    trainingData = allData.filter("not sjId = {}".format(subjectid))
    testData = allData.filter("sjId = {}".format(subjectid))

    trainingData = trainingData.select([
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])
    testc = testData.select([
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])

    print("traindata amount" + str(trainingData.count()))
    print("testdata amount" + str(testData.count()))

    rf = RF(labelCol='resultintervention',
            featuresCol='parameters',
            numTrees=200)
    rfit = rf.fit(trainingData)
    transformed = rfit.transform(testData)
    print("hierzo")
    transformed.select(['itvInterventieOptieId', 'itvInterventieOptieIdvec'])

    # This is the result that must be returned
    results = transformed.select(
        ['itvInterventieOptieId', 'probability', 'prediction'])
    resultslist = results.toJSON()
    return resultslist
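
results.toJSON() returns an RDD of JSON strings, so a caller still has to collect it to materialize the suggestions; a minimal usage sketch with an illustrative subject id:

# Hypothetical usage: print each suggested intervention as a JSON record
for json_row in SuggestIntervention(12345).collect():   # 12345 is an illustrative id
    print(json_row)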
# Create dataframe with predictions and show example
labelAndPrediction = pred_toy_DT.select("label", "row", "prediction",
                                        "features")
display(labelAndPrediction.where(labelAndPrediction.row == 575525629176))

# COMMAND ----------

# MAGIC %md ##### RF Algorithm
# MAGIC Averaging many trees grown from repeated (bootstrap) samples of the training data, known as bagging, reduces the variance that any single tree would have. Bagging grows deep trees and does not prune them. RF training goes a step further to make the result more reliable: each node is randomly assigned a subset of the features to consider as split candidates, so the individual trees differ from each other, and averaging them reduces variance more than bagging alone.
# MAGIC
# MAGIC Below we will train a RF model on the same data using 3 trees.

# COMMAND ----------

# RF model
rf = RF(labelCol="label", featuresCol="features", numTrees=3, maxDepth=5)
RF_model = rf.fit(train_toy)
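
# The per-node feature subsetting described above is controlled by the
# featureSubsetStrategy parameter ("auto", "all", "onethird", "sqrt", "log2", ...).
# A minimal sketch with an explicit setting; "sqrt" is just an illustrative choice:
rf_sqrt = RF(labelCol="label", featuresCol="features", numTrees=3, maxDepth=5,
             featureSubsetStrategy="sqrt")
RF_model_sqrt = rf_sqrt.fit(train_toy)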

# COMMAND ----------

# Print tree nodes for all RF trees
print(RF_model.toDebugString)

# COMMAND ----------

# MAGIC %md ##### Prediction with RF
# MAGIC RF then combines the predictions of all trees using a majority vote. If \\(\hat{p}\_{n,k}\\) is the proportion of the \\(n\\) trees that predict class \\(k\\), the majority vote is \\(\text{argmax}\_k \, \hat{p}\_{n,k}\\).

# COMMAND ----------

# Predict on toy test set with RF
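# The prediction step itself is not shown in this listing; a minimal sketch,
# assuming a held-out test_toy DataFrame split off alongside train_toy:
pred_toy_RF = RF_model.transform(test_toy)
display(pred_toy_RF.select("label", "prediction", "probability"))
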
dtreeClassifier = DecisionTreeClassifier(maxDepth=2,
                                         labelCol="SPECIES_Catogery",
                                         featuresCol="features")
dtreeModel = dtreeClassifier.fit(train_df)

predictions = dtreeModel.transform(test_df)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="SPECIES_Catogery",
                                              metricName="accuracy")
print(evaluator.evaluate(predictions))

predictions.groupBy("SPECIES_Catogery", "prediction").count().show()

# ----------------------------- RANDOM FOREST CLASSIFIER -----------------------------

iris_rf = RF(labelCol='SPECIES_Catogery', featuresCol='features', numTrees=200)
fit = iris_rf.fit(train_df)
prediction_rf = fit.transform(test_df)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="SPECIES_Catogery",
                                              metricName="accuracy")
print(evaluator.evaluate(prediction_rf))

predictionAndLabels = prediction_rf.select(['prediction', 'SPECIES_Catogery'])

metrics = MulticlassMetrics(predictionAndLabels.rdd)
# confusionMatrix() returns a DenseMatrix; call .toArray() for a NumPy array view
confusion_mat = metrics.confusionMatrix()
print(confusion_mat)
Example #10
    def Train(self):
        st = time.time()
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        model_path = self._dataframe_context.get_model_path()
        pipeline_filepath = model_path + "/RandomForest/TrainedModels/pipeline"
        model_filepath = model_path + "/RandomForest/TrainedModels/model"
        summary_filepath = model_path + "/RandomForest/ModelSummary/summary.json"

        df = self._data_frame
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        trainingData, validationData = MLUtils.get_training_and_validation_data(
            indexed, result_column, 0.8)
        OriginalTargetconverter = IndexToString(
            inputCol="label", outputCol="originalTargetColumn")
        rf = RF(labelCol='label', featuresCol='features', numTrees=200)
        fit = rf.fit(trainingData)
        transformed = fit.transform(validationData)
        MLUtils.save_pipeline_or_model(fit, model_filepath)
        feature_importance = MLUtils.calculate_sparkml_feature_importance(
            indexed, fit, categorical_columns, numerical_columns)

        label_classes = transformed.select("label").distinct().collect()
        results = transformed.select(["prediction", "label"])
        if len(label_classes) > 2:
            evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction")
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "accuracy"})  # accuracy of the model
        else:
            evaluator = BinaryClassificationEvaluator(
                rawPredictionCol="prediction")
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderROC"})
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderPR"})
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "areaUnderPR"})  # area under the PR curve (stored as "model_accuracy")

        self._model_summary[
            "feature_importance"] = MLUtils.transform_feature_importance(
                feature_importance)
        self._model_summary["runtime_in_seconds"] = round((time.time() - st),
                                                          2)

        transformed = OriginalTargetconverter.transform(transformed)
        label_indexer_dict = [
            dict(enumerate(field.metadata["ml_attr"]["vals"]))
            for field in transformed.schema.fields if field.name == "label"
        ][0]
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            "predictedClass", prediction_to_levels(transformed.prediction))
        prediction_df = transformed.select(
            ["originalTargetColumn", "predictedClass"]).toPandas()
        objs = {
            "actual": prediction_df["originalTargetColumn"],
            "predicted": prediction_df["predictedClass"]
        }

        self._model_summary[
            "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
                objs["actual"], objs["predicted"])
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"])
        self._model_summary[
            "precision_recall_stats"] = overall_precision_recall[
                "classwise_stats"]
        self._model_summary["model_precision"] = overall_precision_recall[
            "precision"]
        self._model_summary["model_recall"] = overall_precision_recall[
            "recall"]
        self._model_summary["target_variable"] = result_column
        self._model_summary[
            "test_sample_prediction"] = overall_precision_recall[
                "prediction_split"]
        self._model_summary["algorithm_name"] = "Random Forest"
        self._model_summary["validation_method"] = "Train and Test"
        self._model_summary["independent_variables"] = len(
            categorical_columns) + len(numerical_columns)
        self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
            trainingData,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            dataType="spark")
        # print json.dumps(self._model_summary,indent=2)
        self._model_summary["total_trees"] = 100
        self._model_summary["total_rules"] = 300
        CommonUtils.write_to_file(
            summary_filepath, json.dumps({"modelSummary":
                                          self._model_summary}))