def LearningCurve(df, target):
    df_t = df
    string_cols = []
    for (a, b) in df.dtypes:
        if b == 'string' and a != target:
            string_cols.append(a)
    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    pipeline = Pipeline(stages=indexers)
    df_t = pipeline.fit(df_t).transform(df_t)
    cols_now = num_cols + encoded_cols
    assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    tmp = [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()
    trainingData, testData = df_t.randomSplit([0.7, 0.3], seed=0)
    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    plot_points = []
    # Step size (in percent) for each increment of training data
    step_var = 10
    train_count = trainingData.count()
    for i in range(step_var, 101, step_var):
        # takeSample expects an integer sample size
        sample_size = int(i * train_count / 100)
        part_Data = trainingData.rdd.takeSample(False, sample_size, seed=i)
        part_Data = sqlContext.createDataFrame(part_Data)
        model = rf.fit(part_Data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName="accuracy")
        # Calculate training error
        transformed = model.transform(part_Data)
        train_accuracy = evaluator.evaluate(transformed)
        train_error = 1 - train_accuracy
        # Calculate test error
        transformed = model.transform(testData)
        test_accuracy = evaluator.evaluate(transformed)
        test_error = 1 - test_accuracy
        plot_points.append([i, train_error, test_error])
    return plot_points
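# A minimal usage sketch (not from the original source): plot the learning curve
# returned above with matplotlib. The DataFrame `input_df` and the target column
# name 'label_col' are placeholders; the pyspark.ml imports used above are
# assumed to be in scope.
import matplotlib.pyplot as plt

points = LearningCurve(input_df, 'label_col')
pct = [p[0] for p in points]
train_err = [p[1] for p in points]
test_err = [p[2] for p in points]
plt.plot(pct, train_err, label='train error')
plt.plot(pct, test_err, label='test error')
plt.xlabel('% of training data used')
plt.ylabel('error (1 - accuracy)')
plt.legend()
plt.show()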
def rf(df):
    trainingData, testData = df.randomSplit([0.7, 0.3], seed=0)
    rf = RF(labelCol='label', featuresCol='features', numTrees=100)
    fit = rf.fit(trainingData)
    # featureImp = fit.featureImportances
    fit.save("s3a://ffinsight/model_rf")
    prediction = fit.transform(testData)
    return prediction
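# Hedged follow-up sketch (not part of the original code): the model persisted
# above can later be reloaded with the standard pyspark.ml loader and reused
# for scoring. `some_features_df` is a placeholder for a DataFrame with the
# same 'features' column layout used at training time.
from pyspark.ml.classification import RandomForestClassificationModel

loaded_model = RandomForestClassificationModel.load("s3a://ffinsight/model_rf")
new_predictions = loaded_model.transform(some_features_df)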
def FeatureImportances(df, target, FeatureImportance_cutoff):
    df_t = df
    string_cols = []
    for (a, b) in df.dtypes:
        if b == 'string' and a != target:
            string_cols.append(a)
    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    pipeline = Pipeline(stages=indexers)
    df_t = pipeline.fit(df_t).transform(df_t)
    cols_now = num_cols + encoded_cols
    assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    tmp = [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()
    trainingData, testData = df_t.randomSplit([0.8, 0.2], seed=0)
    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    model = rf.fit(trainingData)
    feat_imp = dict()
    vi = model.featureImportances
    no_of_cols = len(cols_now)
    # Map importances back to the original column names (the indexed string
    # columns appear in the same order as their encoded counterparts in cols_now)
    cols_actual = num_cols + string_cols
    for i in range(no_of_cols):
        feat_imp[cols_actual[i]] = vi[i]
    # NOTE: FeatureImportance_cutoff is accepted but not applied here; callers
    # can filter the returned dict against it.
    return feat_imp
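# Hedged usage sketch (not in the original source): keep only the features whose
# importance meets the cutoff. `input_df`, 'label_col', and the 0.01 cutoff are
# placeholders.
importances = FeatureImportances(input_df, 'label_col', 0.01)
selected = {col: imp for col, imp in importances.items() if imp >= 0.01}
for col, imp in sorted(selected.items(), key=lambda kv: -kv[1]):
    print(col, imp)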
def transform(df):
    cols = df.columns
    cols.remove('Severity')
    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    df_transformed = vecAssembler.transform(df)
    return df_transformed


def evaluate_model(df):
    # metricName defaults to "f1"; request "accuracy" explicitly since that is
    # what gets reported below
    evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol='Severity', metricName="accuracy")
    accuracy_rf = evaluator.evaluate(df)
    return accuracy_rf


preprocessed_df = preprocessing(accident_df)
indexed_df = indexing(preprocessed_df)
transformed_df = transform(indexed_df)

# Split data into training and test sets
train, test = transformed_df.randomSplit([0.7, 0.3], seed=2000)

# Random Forest classifier
rf = RF(featuresCol='features', numTrees=12, maxDepth=16,
        labelCol="Severity", maxBins=150)
model_rf = rf.fit(train)

# Predict on the test data
prediction_rf = model_rf.transform(test)
accuracy = evaluate_model(prediction_rf)
print("Accuracy is ", accuracy)
feature_columns.remove('status')
data.toPandas().to_csv("new_data.csv")
assembler_features = VectorAssembler(inputCols=feature_columns, outputCol='features')
prediction_column = StringIndexer(inputCol='status', outputCol='label')
tmp = [assembler_features, prediction_column]
pipeline = Pipeline(stages=tmp)

# Average the area under the ROC curve over 10 random train/test splits
total = 0.0
for i in range(10):
    all_data = pipeline.fit(data).transform(data)
    (training_data, test_data) = all_data.randomSplit([0.8, 0.2])
    # drop() returns a new DataFrame, so reassign to actually remove the column
    test_data = test_data.drop('status')
    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    fit = rf.fit(training_data)
    transformed = fit.transform(test_data)
    results = transformed.select(['probability', 'label'])
    results_collect = results.collect()
    result_list = [(float(row[0][0]), 1.0 - float(row[1])) for row in results_collect]
    scores = sc.parallelize(result_list)
    # `metric` is assumed to be an imported alias for BinaryClassificationMetrics
    metrics = metric(scores)
    total = total + metrics.areaUnderROC
print(total / 10)
assembler = VectorAssembler(inputCols=cols_out, outputCol="features")
labelIndexer = StringIndexer(inputCol="y", outputCol="label")
tmp += [assembler, labelIndexer]
pipeline = Pipeline(stages=tmp)
allData = pipeline.fit(trainDF).transform(trainDF)
allData.cache()
trainData, validData = allData.randomSplit([0.8, 0.2], seed=1)
randforest = RF(labelCol="label", featuresCol="features", numTrees=100)
rf_fit = randforest.fit(trainData)
transformed = rf_fit.transform(validData)
results = transformed.select(["probability", "label"])
results_collect = results.collect()
results_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in results_collect]
score = sc.parallelize(results_list)
metrics = metric(score)
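# Hedged alternative (not in the original): the same areaUnderROC can be computed
# directly on the transformed DataFrame with BinaryClassificationEvaluator,
# avoiding the collect() round trip to the driver.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

auc_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="probability", labelCol="label", metricName="areaUnderROC")
print(auc_evaluator.evaluate(transformed))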
def SuggestIntervention(subjectid):
    print("[v] -> PySpark SQL")
    SPARK_URL = "local[*]"
    spark = SparkSession.builder.appName("").master(SPARK_URL).getOrCreate()
    sc = spark.sparkContext
    # Import our data here; columns are named based on the query below
    sqlcontext = SQLContext(sc)
    columns = [
        'itvInterventieOptieId', 'itvRegieParentId', 'sjId', 'sjGender',
        'sjDateOfBirth', 'sjMaritalStatusId', 'sjWoonplaatsId', 'casId',
        'casClassification', 'casThemaGebiedId', 'lgscoreRegieParentId',
        'lgscoreScore', 'probProbleemOptieId', 'itvGoalReached',
        'itvGeresidiveerd'
    ]
    # Fetch the data for the given subject id
    database = igDatabaseModule.Database("PGA_HRO")
    subjectdata = database.get_data_from_the_database(
        'SELECT distinct intoptid, sjId ,sjGender,sjDateOfBirth,sjMaritalStatusId,sjWoonplaatsId,casId,casClassification,casThemaGebiedId,lgscoreRegieParentId,lgscoreScore FROM tblSubject, tblMeldingZSMPlus, tblCasus, tblLeefgebiedScore, (select * from tblInterventieOptie where intoptIsInactive = 0) as interventies where zsmSubjectId = sjid and zsmCasusId = casId and lgscoreRegieParentId = casId and casThemaGebiedId is not null and sjMaritalStatusId is not null and sjWoonplaatsId is not Null and sjid = {};'
        .format(subjectid))
    pandadf = pandas.DataFrame(subjectdata, columns=[
        'itvInterventieOptieId', 'sjId', 'sjGender', 'sjDateOfBirth',
        'sjMaritalStatusId', 'sjWoonplaatsId', 'casId', 'casClassification',
        'casThemaGebiedId', 'lgscoreRegieParentId', 'lgscoreScore'
    ])
    subjectDF = sqlcontext.createDataFrame(pandadf)
    goalreacheudf = udf(lambda x: x, IntegerType())
    subjectDF = subjectDF.withColumn("itvGoalReached", lit(1))
    subjectDF = subjectDF.withColumn("itvGeresidiveerd", lit("Nee"))
    print("[v] -> parallelize")
    # Use a raw string so the Windows path separator is not treated as an escape
    csvpath = r"c:\FinalCSVnoNull.csv"
    dataDF = spark.read.options(header="true", inferschema="true").csv(csvpath)
    print("[v] -> csv")
    dataDF = dataDF.select([
        'itvInterventieOptieId', 'sjId', 'sjGender', 'sjDateOfBirth',
        'sjMaritalStatusId', 'sjWoonplaatsId', 'casId', 'casClassification',
        'casThemaGebiedId', 'lgscoreRegieParentId', 'lgscoreScore',
        'itvGoalReached', 'itvGeresidiveerd'
    ])

    def calculate_age(born):
        born = datetime.datetime.strptime(born, "%Y-%m-%d %H:%M:%S").date()
        today = date.today()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    def calculate_age_subject(born):
        born = datetime.datetime.strptime(born, "%Y-%m-%d").date()
        today = date.today()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    calculate_age_udf = udf(calculate_age, IntegerType())
    calculate_age_udf_subject = udf(calculate_age_subject, IntegerType())
    dataDF = dataDF.withColumn(
        "sjAge", calculate_age_udf(dataDF.sjDateOfBirth.cast("string")))
    subjectDF = subjectDF.withColumn(
        "sjAge", calculate_age_udf_subject(subjectDF.sjDateOfBirth.cast("string")))
    print("[v] -> ageconvertion")
    df = dataDF.union(subjectDF)
    # One-hot encoding converts the non-integer columns into numeric vectors
    # that the model can distinguish
    column_vec_in = [
        'itvInterventieOptieId', 'sjGender', 'sjMaritalStatusId',
        'sjWoonplaatsId', 'casClassification', 'casThemaGebiedId',
        'lgscoreRegieParentId', 'lgscoreScore'
    ]
    column_vec_out = [
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec'
    ]
    indexers = [
        StringIndexer(inputCol=x, outputCol=x + '_tmp') for x in column_vec_in
    ]
    encoders = [
        OneHotEncoder(dropLast=False, inputCol=x + "_tmp", outputCol=y)
        for x, y in
        zip(column_vec_in, column_vec_out)
    ]
    # Interleave indexers and encoders: index each string column, then encode it
    tmp = [[i, j] for i, j in zip(indexers, encoders)]
    tmp = [i for sublist in tmp for i in sublist]
    print("[v] -> onehotencoding")
    # Finalize with a pipeline
    cols_now = [
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached', 'sjAge'
    ]
    assembler_features = VectorAssembler(inputCols=cols_now, outputCol='parameters')
    labelIndexer = StringIndexer(inputCol='itvGeresidiveerd',
                                 outputCol="resultintervention")
    tmp += [assembler_features, labelIndexer]
    pipeline = Pipeline(stages=tmp)
    allData = pipeline.fit(df).transform(df)
    allData = allData.select([
        'itvInterventieOptieId', 'sjId', 'itvInterventieOptieIdvec',
        'sjGendervec', 'sjMaritalStatusIdvec', 'sjWoonplaatsIdvec',
        'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])
    print("[v] -> pipeline")
    allData.cache()
    print(str(allData.count()))
    print("[v] -> trainingcashe")
    # Train on every other subject; score the interventions for this subject
    trainingData = allData.filter("not sjId = {}".format(subjectid))
    testData = allData.filter("sjId = {}".format(subjectid))
    trainingData = trainingData.select([
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])
    testc = testData.select([
        'itvInterventieOptieIdvec', 'sjGendervec', 'sjMaritalStatusIdvec',
        'sjWoonplaatsIdvec', 'casClassificationvec', 'casThemaGebiedIdvec',
        'lgscoreRegieParentIdvec', 'lgscoreScorevec', 'itvGoalReached',
        'sjAge', 'parameters', "resultintervention"
    ])
    print("traindata amount" + str(trainingData.count()))
    print("testdata amount" + str(testData.count()))
    rf = RF(labelCol='resultintervention', featuresCol='parameters', numTrees=200)
    rfit = rf.fit(trainingData)
    transformed = rfit.transform(testData)
    print("hierzo")
    transformed.select(['itvInterventieOptieId', 'itvInterventieOptieIdvec'])
    # this must be returned!!!!!!!!!!
    results = transformed.select(
        ['itvInterventieOptieId', 'probability', 'prediction'])
    # this ^^^^^^^^^^^^^^^^^^^
    resultslist = results.toJSON()
    return resultslist
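# Hedged usage sketch (not in the original): the function returns an RDD of JSON
# strings (DataFrame.toJSON()), one per candidate intervention. The subject id
# 12345 is a placeholder.
for record in SuggestIntervention(12345).collect():
    print(record)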
# Create dataframe with predictions and show an example
labelAndPrediction = pred_toy_DT.select("label", "row", "prediction", "features")
display(labelAndPrediction.where(labelAndPrediction.row == 575525629176))

# COMMAND ----------

# MAGIC %md ##### RF Algorithm
# MAGIC The method of averaging many trees grown from repeated samples of the training data, or bagging, decreases the variance that any single tree would have. Bagging grows deep trees and does not prune. The RF training method goes a step further to make the result more reliable: each node of an RF tree is randomly assigned a subset of features to consider as possible split candidates. This means the trees differ from each other, so averaging them decreases variance more than bagging alone.
# MAGIC
# MAGIC Below we will train an RF model on the same data using 3 trees.

# COMMAND ----------

# RF model
rf = RF(labelCol="label", featuresCol="features", numTrees=3, maxDepth=5)
RF_model = rf.fit(train_toy)

# COMMAND ----------

# Print tree nodes for all RF trees
print(RF_model.toDebugString)

# COMMAND ----------

# MAGIC %md ##### Prediction with RF
# MAGIC RF combines the predictions of all trees using a majority vote: if \\(\hat{p}\_{n,k}\\) is the proportion of predictions for class \\(k\\) over \\(n\\) trees, the majority vote is \\(\operatorname{argmax}\_k \hat{p}\_{n,k}\\).

# COMMAND ----------

# Predict on toy test set with RF
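# A minimal sketch of this cell's body (not from the original notebook):
# transform the toy test set with the trained RF model and display the result,
# assuming a test_toy DataFrame analogous to train_toy is in scope.
pred_toy_RF = RF_model.transform(test_toy)
display(pred_toy_RF.select("label", "prediction", "probability"))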
dtree_classifier = DecisionTreeClassifier(maxDepth=2,
                                          labelCol="SPECIES_Catogery",
                                          featuresCol="features")
dtreeModel = dtree_classifier.fit(train_df)
predictions = dtreeModel.transform(test_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="SPECIES_Catogery",
                                              metricName="accuracy")
print(evaluator.evaluate(predictions))
predictions.groupBy("SPECIES_Catogery", "prediction").count().show()

# ---------------------------- RANDOM FOREST CLASSIFIER ----------------------------
iris_rf = RF(labelCol='SPECIES_Catogery', featuresCol='features', numTrees=200)
fit = iris_rf.fit(train_df)
prediction_rf = fit.transform(test_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="SPECIES_Catogery",
                                              metricName="accuracy")
print(evaluator.evaluate(prediction_rf))
predictionAndLabels = prediction_rf.select(['prediction', 'SPECIES_Catogery'])
metrics = MulticlassMetrics(predictionAndLabels.rdd)
confusion_mat = metrics.confusionMatrix()
print(confusion_mat)
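# Hedged follow-up (not in the original): MulticlassMetrics also exposes
# aggregate statistics alongside the confusion matrix.
print("accuracy:", metrics.accuracy)
print("weighted precision:", metrics.weightedPrecision)
print("weighted recall:", metrics.weightedRecall)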
def Train(self):
    st = time.time()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    categorical_columns = [
        x for x in categorical_columns if x != result_column
    ]
    model_path = self._dataframe_context.get_model_path()
    pipeline_filepath = model_path + "/RandomForest/TrainedModels/pipeline"
    model_filepath = model_path + "/RandomForest/TrainedModels/model"
    summary_filepath = model_path + "/RandomForest/ModelSummary/summary.json"
    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                  categorical_columns,
                                                  result_column)
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    trainingData, validationData = MLUtils.get_training_and_validation_data(
        indexed, result_column, 0.8)
    OriginalTargetconverter = IndexToString(
        inputCol="label", outputCol="originalTargetColumn")
    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    fit = rf.fit(trainingData)
    transformed = fit.transform(validationData)
    MLUtils.save_pipeline_or_model(fit, model_filepath)
    feature_importance = MLUtils.calculate_sparkml_feature_importance(
        indexed, fit, categorical_columns, numerical_columns)
    label_classes = transformed.select("label").distinct().collect()
    results = transformed.select(["prediction", "label"])
    if len(label_classes) > 2:
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction")
        evaluator.evaluate(results)
        # accuracy of the model
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "accuracy"})
    else:
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="prediction")
        evaluator.evaluate(results)
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderROC"}))
        # print(evaluator.evaluate(results, {evaluator.metricName: "areaUnderPR"}))
        # areaUnderPR is stored as the model accuracy in the binary case
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "areaUnderPR"})
    self._model_summary[
        "feature_importance"] = MLUtils.transform_feature_importance(
            feature_importance)
    self._model_summary["runtime_in_seconds"] = round((time.time() - st), 2)
    transformed = OriginalTargetconverter.transform(transformed)
    label_indexer_dict = [
        dict(enumerate(field.metadata["ml_attr"]["vals"]))
        for field in transformed.schema.fields if field.name == "label"
    ][0]
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(
        "predictedClass", prediction_to_levels(transformed.prediction))
    prediction_df = transformed.select(
        ["originalTargetColumn", "predictedClass"]).toPandas()
    objs = {
        "actual": prediction_df["originalTargetColumn"],
        "predicted": prediction_df["predictedClass"]
    }
    self._model_summary[
        "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
            objs["actual"], objs["predicted"])
    overall_precision_recall = MLUtils.calculate_overall_precision_recall(
        objs["actual"], objs["predicted"])
    self._model_summary[
        "precision_recall_stats"] = overall_precision_recall["classwise_stats"]
    self._model_summary["model_precision"] = overall_precision_recall[
        "precision"]
    self._model_summary["model_recall"] = overall_precision_recall["recall"]
    self._model_summary["target_variable"] = result_column
    self._model_summary[
        "test_sample_prediction"] = overall_precision_recall[
            "prediction_split"]
    self._model_summary["algorithm_name"] = "Random Forest"
    self._model_summary["validation_method"] = "Train and Test"
    self._model_summary["independent_variables"] = len(
        categorical_columns) + len(numerical_columns)
    self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
        trainingData,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        dataType="spark")
    # print(json.dumps(self._model_summary, indent=2))
    self._model_summary["total_trees"] = 100
    self._model_summary["total_rules"] = 300
    CommonUtils.write_to_file(
        summary_filepath, json.dumps({"modelSummary": self._model_summary}))