def __pipeline(self, modeling_code: str, classifiers_metadata: dict,
               database_url_training: str, database_url_test: str) -> None:
    (features_training, features_testing, features_evaluation) = \
        self.__modeling_code_processing(
            modeling_code,
            self.__spark_session,
            database_url_training,
            database_url_test)
    classifier_switcher = {
        "LR": LogisticRegression(),
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "GB": GBTClassifier(),
        "NB": NaiveBayes(),
    }
    classifier_threads = []
    for name, metadata in classifiers_metadata.items():
        classifier = classifier_switcher[name]
        classifier_threads.append(
            self.__thread_pool.submit(
                self.__classifier_processing,
                classifier,
                features_training,
                features_testing,
                features_evaluation,
                metadata,
            ))
    for classifier in classifier_threads:
        testing_prediction, metadata_document = classifier.result()
        self.__save_classifier_result(testing_prediction, metadata_document)
def test_sklearn_decision_tree_multiclass():
    import shap
    from sklearn.tree import DecisionTreeClassifier
    import numpy as np

    X, y = shap.datasets.iris()
    y[y == 2] = 1  # collapse class 2 into class 1, making the problem binary
    model = DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                                   random_state=0)
    model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    # For a binary classifier the per-class SHAP values mirror each other.
    assert np.abs(shap_values[0][0, 0] - 0.05) < 1e-1
    assert np.abs(shap_values[1][0, 0] + 0.05) < 1e-1
def entrenamiento(df):
    # Assemble the input columns into a single feature vector
    df = df.select("Finishing", "ShortPassing", "BallControl", "Stamina",
                   "SlidingTackle", "GKReflexes", "Crossing", "Agility",
                   "Position", "Dribbling", "SprintSpeed")
    assembler = VectorAssembler(
        inputCols=["Finishing", "ShortPassing", "BallControl", "Stamina",
                   "SlidingTackle", "GKReflexes", "Crossing", "Agility",
                   "Dribbling", "SprintSpeed"],
        outputCol="features")
    df = assembler.transform(df)
    # Split our dataset
    (training_df, test_df) = df.randomSplit([0.7, 0.3])
    # Training
    entrenador = DecisionTreeClassifier(
        labelCol="Position", featuresCol="features")
    # Build the pipeline
    pipeline = Pipeline(stages=[entrenador])
    # Fit the model
    model = pipeline.fit(training_df)
    # Prediction
    predictions_df = model.transform(test_df)
    # Evaluator --> accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="Position", predictionCol="prediction",
        metricName="accuracy")
    # Accuracy
    exactitud = evaluator.evaluate(predictions_df)
    print("Accuracy: {}".format(exactitud))
def get_model(model_string='LogisticRegression'):
    """
    Get the desired model object for training and classification

    Args:
        model_string (str): key naming the model to construct

    Returns:
        model object from pyspark.ml.classification
    """
    models_dict = {
        'LogisticRegression': LogisticRegression(maxIter=20, regParam=0.3,
                                                 elasticNetParam=0),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'RandomForestClassifier': RandomForestClassifier(numTrees=10),
        # Deep learning note: the number of neurons in the last layer needs
        # to equal the number of categories. The number of neurons in the
        # first layer needs to equal the vocabulary size of the count
        # vectorizer.
        'MultilayerPerceptronClassifier': MultilayerPerceptronClassifier(
            tol=1e-3, maxIter=10000, layers=[500, 100, 20, 6],
            blockSize=128, seed=1234),
        'NaiveBayes': NaiveBayes()
    }
    return models_dict[model_string]
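# Hypothetical usage of get_model above; `train_df` (a DataFrame with
# "label" and "features" columns) is an assumption, not part of the original.
clf = get_model('DecisionTreeClassifier')
model = clf.fit(train_df)
predictions = model.transform(train_df)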
def get_multi_classification_pipeline():
    transformer = AddHasText()
    stringIndexer = StringIndexer(
        inputCol='subbreddit_display_name',
        outputCol='label'
    )
    assembler = VectorAssembler(
        inputCols=[
            "post_title_embedding",
            "comments_number",
            "nsfw",
            "spoiler",
            "up_votes_number",
            "has_text"
        ],
        outputCol="features"
    )
    dt = DecisionTreeClassifier(
        labelCol='label',
        featuresCol='features'
    )
    return Pipeline(
        stages=[
            transformer,
            stringIndexer,
            assembler,
            dt
        ]
    )
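# Hypothetical usage of the pipeline above; `posts_df` is an assumed
# DataFrame containing the raw columns the stages expect (AddHasText is the
# custom transformer referenced in the original).
pipeline = get_multi_classification_pipeline()
model = pipeline.fit(posts_df)
scored = model.transform(posts_df)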
def DecisionTree():
    IrisData = spark.sparkContext.textFile(
        "file:///home/unbroken/MyFiles/Work/Programming/Spark/DecisionTree/Iris.txt") \
        .map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    IrisData.createOrReplaceTempView("iris")
    df = spark.sql("select * from iris")
    labelIndexer = StringIndexer(inputCol='label',
                                 outputCol='labelIndex').fit(IrisData)
    featureIndexer = VectorIndexer(
        inputCol='feature',
        outputCol='indexFeature').setMaxCategories(4).fit(IrisData)
    labelConverter = IndexToString(
        inputCol='prediction',
        outputCol='predictionLabel').setLabels(labelIndexer.labels)
    trainingData, testingData = IrisData.randomSplit([0.7, 0.3])
    dtClassifier = DecisionTreeClassifier().setLabelCol(
        'labelIndex').setFeaturesCol('indexFeature')
    pipelineClassifier = Pipeline().setStages(
        [labelIndexer, featureIndexer, dtClassifier, labelConverter])
    modelClassifier = pipelineClassifier.fit(trainingData)
    prediction = modelClassifier.transform(testingData)
    prediction.show()  # show() prints the rows itself and returns None
    evaluator = MulticlassClassificationEvaluator().setLabelCol(
        'labelIndex').setPredictionCol('prediction').setMetricName("accuracy")
    accuracy = evaluator.evaluate(prediction)
    print(accuracy)
    treeModelClassifier = modelClassifier.stages[2]
    print("Learned classification tree model:\n" +
          str(treeModelClassifier.toDebugString))
def trainBinaryTreeModel(data, directory=""): tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words") remover = StopWordsRemover().setInputCol("words").setOutputCol( "filtered").setCaseSensitive(False) hashingTF = HashingTF().setNumFeatures(1000).setInputCol( "filtered").setOutputCol("rawFeatures") idf = IDF().setInputCol("rawFeatures").setOutputCol( "features").setMinDocFreq(0) dt = DecisionTreeClassifier(labelCol="label", maxDepth=30, featuresCol="features") pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, dt]) paramGrid = ParamGridBuilder()\ .addGrid(dt.maxDepth, [2, 5, 10, 20, 30]) \ .addGrid(dt.maxBins, [10, 50, 80]) \ .build() crossval = TrainValidationSplit( estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator().setMetricName( 'areaUnderPR' ), # set area Under precision-recall curve as the evaluation metric # 80% of the data will be used for training, 20% for validation. trainRatio=0.8) cvModel = crossval.fit(data) modelName = directory + "BinaryTreeModel" cvModel.bestModel.write().overwrite().save(modelName) return modelName
def trainAndEvalModelByDecisionTreeClassifier(stages, train_df, test_df, evaluator):
    '''
    Build an ML Pipeline around DecisionTreeClassifier, then train and
    evaluate the resulting model.
    :param stages: upstream pipeline stages (feature preparation)
    :param train_df: training DataFrame
    :param test_df: test DataFrame
    :param evaluator: evaluator used to score the predictions
    :return: (bestModel, predicts, accuracy)
    '''
    print('======================= Training an ML Pipeline built on DecisionTreeClassifier =======================')
    dt = DecisionTreeClassifier(labelCol='label', featuresCol='features',
                                maxDepth=5, maxBins=20)
    dtPipeline = Pipeline(stages=stages + [dt])
    # print(str(dtPipeline.getStages()))
    dtPipelineModel = dtPipeline.fit(train_df)
    # The fitted DecisionTreeClassificationModel is the last pipeline stage
    # (the original indexed stages[1], which only holds for a single upstream stage).
    bestModel = dtPipelineModel.stages[-1]
    # print(bestModel.toDebugString)
    print('======================= Predicting with the trained DecisionTreeClassifier pipeline =======================')
    predicts = dtPipelineModel.transform(test_df)
    # print(str(predicts.columns))
    # Columns added by prediction: 'rawPrediction', 'probability', 'prediction'
    # predicts.select('probability', 'prediction').show(10)
    accuracy = evaluator.evaluate(predicts)
    print('======================= Model accuracy after training (accuracy=' + str(accuracy) + ') =======================')
    return (bestModel, predicts, accuracy)
def main(self, sc, *args):
    from pyspark.sql.session import SparkSession
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import HashingTF, Tokenizer
    from pyspark.ml.classification import DecisionTreeClassifier

    # Initialize the SparkSession
    sql = SparkSession.builder \
        .enableHiveSupport() \
        .config("hive.exec.dynamic.partition", "true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("hive.exec.max.dynamic.partitions", "4096") \
        .getOrCreate()

    # Load the cleaned data
    df = sql.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("delimiter", ";") \
        .load(self.input().path)

    # Train the classifier
    labeled = df.withColumn("label",
                            df.subreddit.like("datascience").cast("double"))
    train_set, test_set = labeled.randomSplit([0.8, 0.2])
    tokenizer = Tokenizer().setInputCol("cleaned_words").setOutputCol("tokenized")
    hashing = HashingTF().setNumFeatures(1000).setInputCol("tokenized").setOutputCol("features")
    decision_tree = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[tokenizer, hashing, decision_tree])
    model = pipeline.fit(train_set)
    model.save(self.output().path)
def evaluateDecisionTree(trainDF, testDF):
    """
    Train decision tree classifiers over a few maxDepth values and
    return a list of benchmark records, one per maxDepth value.
    """
    benchmarkData = []
    for maxDepth in [10, 15]:  # maxDepth must be an int, not a float
        classifier = DecisionTreeClassifier(maxDepth=maxDepth)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Decision Tree evaluation with maxDepth: {}".format(maxDepth))
        accuracy = printevaluatation(model, predictions)
        benchmarkData += [("DT", "maxDepth", maxDepth, float(accuracy))]
    return benchmarkData
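# printevaluatation is not defined in this snippet; a plausible sketch of
# such a helper (an assumption, not the original implementation):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def printevaluatation(model, predictions):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy: {}".format(accuracy))
    return accuracy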
def DTree_with_maxFeatures_maxDepth_fixed(data, max_depth, max_features):
    gt0 = time()
    key = 'DTree:depth' + repr(max_depth) + ':features' + repr(max_features)
    crossed[key] = []
    # Use the function arguments (the original referenced undefined md/mf).
    classifier = DecisionTreeClassifier(maxDepth=max_depth,
                                        maxBins=max_features,
                                        impurity='gini',
                                        maxMemoryInMB=1024)
    model = classifier.fit(data['scaled_train_df'])
    predictions = model.transform(data['scaled_cv_df'])
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    metric = evaluator.evaluate(predictions)
    crossed[key].append([metric, time() - gt0])
    return crossed
def decision_tree_classifier(trainingDataFrame, maxCategories=4, maxDepth=5,
                             maxBins=32, minInstancesPerNode=1,
                             minInfoGain=0.0, maxMemoryInMB=256,
                             cacheNodeIds=False, checkpointInterval=10,
                             impurity="gini", seed=None):
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") \
        .setHandleInvalid("keep").fit(trainingDataFrame)
    featureIndexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures",
        maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=maxDepth, maxBins=maxBins,
                                minInstancesPerNode=minInstancesPerNode,
                                minInfoGain=minInfoGain,
                                maxMemoryInMB=maxMemoryInMB,
                                cacheNodeIds=cacheNodeIds,
                                checkpointInterval=checkpointInterval,
                                impurity=impurity, seed=seed)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[2]
    return result
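# Hypothetical usage of decision_tree_classifier; the toy DataFrame and the
# `spark` session are assumptions for illustration only.
from pyspark.ml.linalg import Vectors

toy_df = spark.createDataFrame(
    [(0.0, Vectors.dense(0.0, 1.0)), (1.0, Vectors.dense(1.0, 0.0))],
    ["label", "features"])
result = decision_tree_classifier(toy_df, maxDepth=3)
print(result["summary"].toDebugString)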
def entrenamiento(df):
    # Assemble the input columns into a single feature vector
    df = df.select("EDAD", "GENERO", "ETNIA", "GLICEMIA",
                   "PERIMETRO_ABDOMINAL", "RCV_GLOBAL", "IMC", "DIABETES")
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "GLICEMIA", "PERIMETRO_ABDOMINAL",
        "RCV_GLOBAL", "IMC"
    ], outputCol="features")
    df = assembler.transform(df)
    # Split the dataset
    (training_df, test_df, validation_df) = df.randomSplit([0.7, 0.2, 0.1])
    # Training
    entrenador = DecisionTreeClassifier(labelCol="DIABETES",
                                        featuresCol="features")
    # Build the pipeline
    pipeline = Pipeline(stages=[entrenador])
    # Fit the model
    model = pipeline.fit(training_df)
    # Predictions (keep test and validation separate; the original
    # overwrote the first transform with the second)
    test_predictions_df = model.transform(test_df)
    validation_predictions_df = model.transform(validation_df)
    # Evaluator --> accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="DIABETES",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    # Accuracy
    print("Test accuracy: {}".format(evaluator.evaluate(test_predictions_df)))
    print("Validation accuracy: {}".format(
        evaluator.evaluate(validation_predictions_df)))
def training(df):
    # 0. Load the cleaned data
    df_cleanning = df.select("id").distinct()
    # Split the data into training and test sets (30% held out for testing)
    (df_training, df_test) = df_cleanning.randomSplit([0.7, 0.3])

    # 1. Load the training data and prepare the full feature set
    df_result = df
    df_result = df_result.select("id", "label", "features")
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(df_result)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=6).fit(df_result)
    df_training.show(10)

    # 1.1 Build the training set
    df_training = df_training.join(df_result, how="left", on="id")
    df_training.show()
    print(df_training.count())

    # 1.2 Build the test set
    df_test = df_test.join(df_result, how="left", on="id")
    df_test.show()
    print(df_test.count())

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model. This also runs the indexers.
    model = pipeline.fit(df_training)

    # Make predictions.
    df_predictions = model.transform(df_test)

    # Select example rows to display.
    df_predictions.show(10)
    df_predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(df_predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/pfizer_model/0.0.4/model_without_prod"
    )
    print(treeModel.toDebugString)
    return treeModel
def test_decisiontree_classifier(self):
    dt = DecisionTreeClassifier(maxDepth=1)
    path = tempfile.mkdtemp()
    dtc_path = path + "/dtc"
    dt.save(dtc_path)
    dt2 = DecisionTreeClassifier.load(dtc_path)
    self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                     "Loaded DecisionTreeClassifier instance uid (%s) "
                     "did not match Param's uid (%s)"
                     % (dt2.uid, dt2.maxDepth.parent))
    self.assertEqual(dt._defaultParamMap[dt.maxDepth],
                     dt2._defaultParamMap[dt2.maxDepth],
                     "Loaded DecisionTreeClassifier instance default params did not match " +
                     "original defaults")
    try:
        rmtree(path)
    except OSError:
        pass
def train_model(self, train_df, assembler):
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                seed=self.RANDOM_SEED)
    pipeline = Pipeline(stages=[assembler, dt])
    model = pipeline.fit(train_df)
    return model
def trainModel(self, sentimentInfoData):
    label = sentimentInfoData.get(pc.INDEXEDCOLM)
    feature = sentimentInfoData.get(pc.FEATURECOLUMN)
    dataset = sentimentInfoData.get(pc.DATASET)
    # Temporary split of the dataset into training and testing sets
    (trainDataset, testDataset) = dataset.randomSplit([0.7, 0.3])
    decisionTreeClassifier = DecisionTreeClassifier(labelCol=label,
                                                    featuresCol=feature)
    decisionModel = decisionTreeClassifier.fit(trainDataset)
    # decisionModel.transform(trainDataset).groupBy("sentiment").count().show()
    predictionDataset = decisionModel.transform(testDataset)
    # Calculate the accuracy of the model, using the same label column the
    # classifier was trained with (the original hard-coded "indexedLabel").
    evaluator = MulticlassClassificationEvaluator(
        labelCol=label, predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictionDataset)
    print("Test Error = %g " % (1.0 - accuracy))
    # gbt = GBTClassifier(labelCol=label, featuresCol=feature).fit(trainDataset)
def main(spark, filename):
    df = spark.read.csv(filename, header=False, inferSchema=True)
    # df.show(4)
    # +---+---+---+---+-----------+
    # |_c0|_c1|_c2|_c3|        _c4|
    # +---+---+---+---+-----------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|
    # |4.9|3.0|1.4|0.2|Iris-setosa|
    # |4.7|3.2|1.3|0.2|Iris-setosa|
    # |4.6|3.1|1.5|0.2|Iris-setosa|
    # +---+---+---+---+-----------+
    vector_assembler = VectorAssembler(inputCols=['_c0', '_c1', '_c2', '_c3'],
                                       outputCol='features')
    v_df = vector_assembler.transform(df)
    # v_df.show(4)
    # +---+---+---+---+-----------+-----------------+
    # |_c0|_c1|_c2|_c3|        _c4|         features|
    # +---+---+---+---+-----------+-----------------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
    # +---+---+---+---+-----------+-----------------+
    # only showing top 4 rows
    indexer = StringIndexer(inputCol='_c4', outputCol='label')
    i_df = indexer.fit(v_df).transform(v_df)
    # i_df.show(4)
    # +---+---+---+---+-----------+-----------------+-----+
    # |_c0|_c1|_c2|_c3|        _c4|         features|label|
    # +---+---+---+---+-----------+-----------------+-----+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
    # +---+---+---+---+-----------+-----------------+-----+
    # only showing top 4 rows
    splits = i_df.randomSplit([0.6, 0.4], 1)
    train_df = splits[0]
    test_df = splits[1]
    dt = DecisionTreeClassifier(labelCol='label', featuresCol='features')
    dt_model = dt.fit(train_df)
    dt_pred = dt_model.transform(test_df)
    dt_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='accuracy')
    dt_accuracy = dt_evaluator.evaluate(dt_pred)
    print(dt_accuracy)
def train(data, max_depth, max_bins):
    print("Parameters: max_depth: {} max_bins: {}".format(max_depth, max_bins))
    # spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()
    # Load the data stored in LIBSVM format as a DataFrame.
    # data = spark.read.format("libsvm").load(os.environ['DSX_PROJECT_DIR']+data_path)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are
    # treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=max_depth, maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, dt])

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {} ".format(test_error))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    tree_model = model.stages[2]
    print(tree_model)

    mlflow.spark.log_model(model, '')
    # Assumes a `spark` session exists in the enclosing scope (the local
    # session creation above is commented out).
    spark.stop()
def pipeline(self, modeling_code, classifiers_metadata):
    spark_session = (
        SparkSession
        .builder
        .appName("modelBuilder")
        .config("spark.driver.port", os.environ[SPARK_DRIVER_PORT])
        .config("spark.driver.host", os.environ[MODEL_BUILDER_HOST_NAME])
        .config("spark.jars.packages",
                "org.mongodb.spark:mongo-spark-connector_2.11:2.4.2")
        .config("spark.scheduler.mode", "FAIR")
        .config("spark.scheduler.pool", "modelBuilder")
        .config("spark.scheduler.allocation.file", "./fairscheduler.xml")
        .master("spark://" + os.environ[SPARKMASTER_HOST] + ":" +
                str(os.environ[SPARKMASTER_PORT]))
        .getOrCreate()
    )
    (features_training, features_testing, features_evaluation) = \
        self.modeling_code_processing(modeling_code, spark_session)
    classifier_switcher = {
        "LR": LogisticRegression(),
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "GB": GBTClassifier(),
        "NB": NaiveBayes(),
    }
    classifier_threads = []
    for name, metadata in classifiers_metadata.items():
        classifier = classifier_switcher[name]
        classifier_threads.append(
            self.thread_pool.submit(
                Model.classifier_processing,
                classifier,
                features_training,
                features_testing,
                features_evaluation,
                metadata,
            )
        )
    for classifier in classifier_threads:
        testing_prediction, metadata_document = classifier.result()
        self.save_classifier_result(testing_prediction, metadata_document)
    spark_session.stop()
def decisionTree(training_data, test_data):
    tree_classifier = DecisionTreeClassifier(
        featuresCol="features",  # column holding the vector of input attributes
        labelCol="Casualty_Severity_Index",  # column holding the target attribute (class indices)
        impurity="entropy",  # use the information-gain criterion for splits
        maxDepth=5)  # bound the maximum depth of the generated tree

    tree_model = tree_classifier.fit(training_data)
    predictions = tree_model.transform(test_data)
    test_error = predictions.filter(
        predictions["prediction"] != predictions["Casualty_Severity_Index"]
    ).count() / float(test_data.count())
    print("Testing error: {0:.4f}".format(test_error))
    return predictions
def decision_tree_generator(training_data, deal_id):
    #### In:
    # A training data set
    # The deal_id you want to generate a decision tree for
    #### Out:
    # The tree is saved
    # An update message is outputted
    training_data = training_data.withColumnRenamed(deal_id, 'label')

    # Note: `algo` and `numClasses` belong to the old MLlib RDD API and are
    # not DecisionTreeClassifier parameters, so they are dropped here.
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                maxDepth=8, impurity="entropy")
    model = dt.fit(training_data)
    model.write().overwrite().save(
        f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/")
    output_message = "Saved a Decision Tree for " + deal_id + "."
    print(output_message)
    return model
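# Hypothetical follow-up: reload a saved tree for scoring. The path mirrors
# the save path above; `deal_id` and `new_data` (a DataFrame with a
# "features" column) are assumptions at this scope.
from pyspark.ml.classification import DecisionTreeClassificationModel

loaded = DecisionTreeClassificationModel.load(
    f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/")
predictions = loaded.transform(new_data)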
def decisionTreeClassifier(trainingData, testData, ncolumns, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    import numpy as np
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import time

    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                maxDepth=15, maxBins=15, impurity='entropy')
    start = time.time()
    cvModelDT = dt.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60
    prediction = cvModelDT.transform(testData)

    # Select (prediction, true label) and compute test accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    # Evaluate model (area under curve)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    # Pick the three most important features by name
    fi = cvModelDT.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])
    return feat, accuracy, areaUC, timer
def compute_decision_tree(self):
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    stages = [self.featurizer, dt]
    # baseOn expects a param dict (or (param, value) pairs); fix the
    # pipeline's `stages` param to the list built above.
    paramGrid = ParamGridBuilder() \
        .baseOn({self.train_pipeline.stages: stages}) \
        .addGrid(self.featurizer.modelName, self.featurizers) \
        .build()
    return paramGrid
def get_model(classifier, params):
    """
    TODO: Add support for params in pyspark ML
    :param classifier:
    :param params:
    :return:
    """
    if classifier == 'Decision Tree':
        return DecisionTreeClassifier(labelCol="cardio", featuresCol="features")
    return RandomForestClassifier(labelCol="cardio", featuresCol="features")
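# One way the TODO above could be addressed (a sketch, not the original
# design): forward `params` as keyword arguments, since both classifiers
# accept their hyperparameters in the constructor.
def get_model_with_params(classifier, params):
    params = params or {}
    if classifier == 'Decision Tree':
        return DecisionTreeClassifier(labelCol="cardio",
                                      featuresCol="features", **params)
    return RandomForestClassifier(labelCol="cardio",
                                  featuresCol="features", **params)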
def testWorkflow(self):
    df = self.sqlContext.read.csv(irisCsvFile, header=True, inferSchema=True)

    formula = RFormula(formula="Species ~ .")
    classifier = DecisionTreeClassifier()
    pipeline = Pipeline(stages=[formula, classifier])
    pipelineModel = pipeline.fit(df)

    pmmlBytes = toPMMLBytes(self.sc, df, pipelineModel)
    pmmlString = pmmlBytes.decode("UTF-8")
    self.assertTrue(pmmlString.find(
        "<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">") > -1)
def __init__(self, data):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    vectorizer = CountVectorizer(inputCol="words", outputCol="rawFeatures")
    idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")
    dt = DecisionTreeClassifier(maxDepth=30, maxBins=128,
                                minInstancesPerNode=5, maxMemoryInMB=4096)
    pipeline = Pipeline(stages=[tokenizer, vectorizer, idf, dt])
    self.model = pipeline.fit(data)
def retrain_full_model(data, model_type, paramMap):
    '''
    This function takes the whole dataset and retrains the given model
    with the best parameters.

    Arguments:
        data {PySpark Dataframe} -- A PySpark Dataframe containing feature
            vectors and labels
        model_type {str} -- The type of model to train
        paramMap {dict} -- A dictionary of the best parameter values

    Returns:
        model -- Returns the model retrained on the full dataset
    '''
    if model_type == 'logistic':
        lr = LogisticRegression()
        model = lr.fit(data, paramMap)
    elif model_type == 'decisiontree':
        dt = DecisionTreeClassifier()
        model = dt.fit(data, paramMap)
    return model
def classify_target():
    """Forecast binary target."""
    df = sql.read.parquet(str(DATA_PARQUET))
    features = ['cost', 'call_duration_minutes', 'data_volume_mb']
    variables = features + ['test_flag', 'target']

    pipeline_prepare = Pipeline(stages=[
        VectorAssembler(inputCols=features, outputCol='features'),
    ])
    prepared = pipeline_prepare.fit(df).transform(df.dropna(subset=variables))
    training = prepared.filter(col('test_flag') == 0)
    testing = prepared.filter(col('test_flag') == 1)
    training_small = training.sample(fraction=0.3, seed=100500)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='target')
    breakpoint()

    # Logistic regression
    classifier = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                    featuresCol='features', labelCol='target',
                                    predictionCol='prediction',
                                    probabilityCol='probability')
    model = classifier.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))
    breakpoint()

    # Decision Tree Classifier
    classifier = DecisionTreeClassifier(featuresCol='features',
                                        labelCol='target', maxDepth=3)
    model = classifier.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))
    breakpoint()

    # Random Forest Classifier (the original fitted the previous classifier
    # and pointed labelCol at a nonexistent 'label' column; fit the forest
    # itself on the same target)
    classifier = RandomForestClassifier(featuresCol='features',
                                        labelCol='target')
    model = classifier.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))
    breakpoint()
def __init__(self, classifier_class, num_classes=None,
             numerical_features_index=None, nominal_features_index=None,
             fine_nominal_features_index=None, classifier_opts=None,
             epochs_number=None, level=None, fold=None, classify=None,
             workers_number=None, arbitrary_discr='', weight_features=True):
    self.spark_session = SingletonSparkSession.get_session()
    self.scale = False
    self.probas_ = None
    self.is_keras = False
    self.workers_number = workers_number
    self.epochs_number = epochs_number

    if classifier_class == 'drf':
        self._classifier = RandomForestClassifier(
            featuresCol='features', labelCol='categorical_label',
            predictionCol='prediction', probabilityCol='probability',
            rawPredictionCol='rawPrediction', maxDepth=20, maxBins=128,
            minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=1024,
            cacheNodeIds=False, checkpointInterval=10, impurity='gini',
            numTrees=100, featureSubsetStrategy='sqrt', seed=None,
            subsamplingRate=1.0)
    elif classifier_class == 'dnb':
        self._classifier = NaiveBayes(
            featuresCol='scaled_features', labelCol='categorical_label',
            predictionCol='prediction', probabilityCol='probability',
            rawPredictionCol='rawPrediction', smoothing=1.0,
            modelType='multinomial', thresholds=None, weightCol=None)
        self.scale = True
    elif classifier_class == 'dgb':
        self._classifier = GBTClassifier(
            featuresCol='features', labelCol='categorical_label',
            predictionCol='prediction', maxDepth=5, maxBins=32,
            minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
            cacheNodeIds=False, checkpointInterval=10, lossType='logistic',
            maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
            featureSubsetStrategy='all')
    elif classifier_class == 'ddt':
        self._classifier = DecisionTreeClassifier(
            featuresCol='features', labelCol='categorical_label',
            predictionCol='prediction', probabilityCol='probability',
            rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32,
            minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
            cacheNodeIds=False, checkpointInterval=10, impurity='gini',
            seed=None)
    elif classifier_class.startswith('dk'):
        depth = classifier_opts[0]
        self.keras_wrapper = SklearnKerasWrapper(
            *classifier_opts, model_class=classifier_class[1:],
            epochs_number=epochs_number, num_classes=num_classes,
            nominal_features_index=[], fine_nominal_features_index=[],
            numerical_features_index=numerical_features_index +
            fine_nominal_features_index + nominal_features_index,
            level=level, fold=fold, classify=classify,
            weight_features=weight_features, arbitrary_discr=arbitrary_discr)
        self._classifier = self.keras_wrapper.init_model()[2]
        self.nominal_features_index = nominal_features_index
        self.is_keras = True

    self.model_ = None
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC ####Decision Trees
# MAGIC You can read more about Decision Trees from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-decision-tree.html).
# MAGIC
# MAGIC Decision Trees is a popular algorithm as it can handle categorical data and work with multiclass data.

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# COMMAND ----------

# MAGIC %md We can extract the number of nodes in our decision tree as well as the tree depth of our model.

# COMMAND ----------

print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

# COMMAND ----------
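# MAGIC %md For a closer look at what the model learned, the fitted tree can also print its split rules. This extra cell is a sketch, not part of the original notebook; `toDebugString` is a standard property of the fitted tree model.

# COMMAND ----------

# Print the learned split rules of the fitted decision tree
print(dtModel.toDebugString)

# COMMAND ----------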
def main():
    root = os.path.dirname(os.path.abspath(__file__))

    print("Digits Handwriting Recognition using Spark")
    print("Root file path is = %s" % root)
    conf = SparkConf().setAppName("OCR")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    sqlContext = SQLContext(sc)

    print("loading dataset")
    trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
    testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

    # check if rdd support toDF
    if not hasattr(trainRDD, "toDF"):
        print("ERROR: RDD does not support toDF")
        sys.exit(1)  # os.exit does not exist; use sys.exit

    # convert RDDs to data frames
    trainDF = trainRDD.toDF()
    testDF = testRDD.toDF()

    print("INFO: train dataframe count = %u" % trainDF.count())
    print("INFO: test dataframe count = %u" % testDF.count())

    indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    dtc = DecisionTreeClassifier(labelCol="indexedLabel")

    pipeline = Pipeline(stages=[indexer, dtc])
    model = pipeline.fit(trainDF)

    # train multiple models of varied depth (the original comments said
    # "CNN", but these are decision trees)
    variedMaxDepthModels = []

    print("Create decision tree models of varied depth [1..8]")
    for mdepth in range(1, 9):  # xrange is Python 2 only
        start = time.time()

        # maximum depth
        dtc.setMaxDepth(mdepth)

        # create pipeline
        pipeline = Pipeline(stages=[indexer, dtc])

        # create the model
        model = pipeline.fit(trainDF)

        # add to varied container
        variedMaxDepthModels.append(model)

        end = time.time()
        print("trained a decision tree of depth %u, duration = [%.3f] secs"
              % (mdepth, end - start))

    print("=================================================")

    # report model accuracies
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  metricName="precision")

    print("Evaluate all models precision")
    for mdepth in range(1, 9):
        model = variedMaxDepthModels[mdepth - 1]
        predictions = model.transform(testDF)
        precision = evaluator.evaluate(predictions)
        print("decision tree depth = %u, precision = %.3f" % (mdepth, precision))

    print("Finished processing %u digits" % testDF.count())
# String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
td.show()

# Splitting data
(trainingData, testData) = td.randomSplit([0.6, 0.4])
trainingData.count()
testData.count()
testData.collect()

# Creating decision tree model
dtClassifer = DecisionTreeClassifier(labelCol="indexed",
                                     minInstancesPerNode=1500)
dtModel = dtClassifer.fit(trainingData)
dtModel.numNodes
dtModel.depth

# Predict on the test data (the first transform over trainingData was
# immediately overwritten in the original; score the test set only)
predictions = dtModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").show(10)

# Evaluation
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="indexed",
                                              metricName="precision")
evaluator.evaluate(predictions)

# Draw a confusion matrix (see the sketch below)
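# A minimal sketch for the confusion-matrix step above, using the RDD-based
# MulticlassMetrics (an assumption; the original snippet ends before this point).
from pyspark.mllib.evaluation import MulticlassMetrics

prediction_and_labels = predictions.select("prediction", "indexed") \
    .rdd.map(lambda row: (float(row.prediction), float(row.indexed)))
metrics = MulticlassMetrics(prediction_and_labels)
print(metrics.confusionMatrix().toArray())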
#section 8.2.6
# OneVsRest is not available in Python.

#section 8.3.1
from pyspark.ml.feature import StringIndexer
dtsi = StringIndexer(inputCol="label", outputCol="label-ind")
dtsm = dtsi.fit(penlpoints)
pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed("label-ind", "label")

pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
# On Spark 2+, go through .rdd before map (DataFrames no longer expose map).
dtresrdd = dtpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205., 0., 3., 0., 0., 3., 1., 0., 0., 0.],
#             [ 0., 213., 0., 1., 2., 1., 0., 2., 0., ...
# I know this step was useful to me once; here it doesn't seem to do much.
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol())
dt_model = dt.fit(dfTrainFinal)

# Apply the same steps to our (ridiculously small) test set.
# In theory a Pipeline automates all of this, but we may not use one.
# EDIT: it is actually fairly easy to chain each step as a transformer,
# so a Pipeline is probably feasible -- see the sketch below.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)

# The predictions
df_test_pred = dt_model.transform(df_test_final)
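# A sketch of the Pipeline variant hinted at in the EDIT above -- an
# assumption built from the stages already defined in this snippet
# (tokenizer, htf, the IDF estimator behind idfModel, string_indexer, dt);
# `dfTrain` is the assumed raw training DataFrame.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[tokenizer, htf, idf, string_indexer, dt])
pipeline_model = pipeline.fit(dfTrain)
df_test_pred = pipeline_model.transform(dfTest)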