def tree_model() -> (DecisionTreeClassificationModel):
    """Load the decision-tree fixture matching the installed Spark major version.

    A Spark session with master ``local[2]`` and app name
    ``dtreeviz_sparkml`` is created (or reused) before loading. Spark 3+
    uses the ``spark_3_0`` fixture and Spark 2 uses the ``spark_2`` fixture;
    for any older major version nothing is loaded and ``None`` is returned.
    """
    builder = SparkSession.builder
    builder = builder.master("local[2]")
    builder = builder.appName("dtreeviz_sparkml")
    builder.getOrCreate()

    major_version = int(pyspark.__version__.partition(".")[0])
    if major_version < 2:
        # No fixture is available below Spark 2.
        return None

    if major_version >= 3:
        fixture_path = "fixtures/spark_3_0_decision_tree_classifier.model"
    else:
        fixture_path = "fixtures/spark_2_decision_tree_classifier.model"
    return DecisionTreeClassificationModel.load(fixture_path)
def test(spark):
    """Score ``predict.csv`` with the saved tree model and print its accuracy.

    The file is parsed with ``parse_line``, tokenised, hashed into 8000
    term-frequency buckets and rescaled with an IDF model that is fit on
    this same data. Predictions are written as CSV to ``submit``
    (overwriting any previous output) and the accuracy is printed.

    Args:
        spark: Active ``SparkSession``; its ``sparkContext`` reads the input.
    """
    def _as_labeled_row(record):
        # Force a float label and a dense feature vector for the classifier.
        return Row(label=float(record['label']),
                   features=Vectors.dense(record['features']))

    context = spark.sparkContext
    word_splitter = Tokenizer(inputCol="sentence", outputCol="words")
    term_hasher = HashingTF(inputCol="words", outputCol="rawFeatures",
                            numFeatures=8000)
    idf_estimator = IDF(inputCol="rawFeatures", outputCol="features")

    parsed = context.textFile('predict.csv').map(parse_line).toDF()
    classifier = DecisionTreeClassificationModel.load('Bayes20000')

    hashed = term_hasher.transform(word_splitter.transform(parsed))
    rescaled = idf_estimator.fit(hashed).transform(hashed)
    rescaled.persist()

    labeled = (rescaled.select("features", "label")
               .rdd.map(_as_labeled_row)
               .toDF())
    scored = classifier.transform(labeled)
    scored.select('prediction').write.csv(
        path='submit', header=True, sep=',', mode='overwrite')

    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = accuracy_evaluator.evaluate(scored)
    print("The accuracy on test-set is " + str(accuracy))
def tree_model() -> (DecisionTreeClassificationModel):
    """Return the decision-tree classifier stored in the fixtures directory.

    A Spark session with master ``local[2]`` and app name
    ``dtreeviz_sparkml`` is created (or reused) before the model is loaded.
    """
    builder = SparkSession.builder.master("local[2]")
    builder.appName("dtreeviz_sparkml").getOrCreate()

    fixture_path = "fixtures/spark_decision_tree_classifier.model"
    return DecisionTreeClassificationModel.load(fixture_path)
def read_model(self):
    """Load the best classifier and name the featurizer it was trained with.

    Both pieces of information are encoded in ``self.best_model_path``: the
    path must contain one classifier name (``LogisticRegression``,
    ``DecisionTree``, ``RandomForest`` or ``LinearSVC``) and one featurizer
    name (``VGG16``, ``VGG19``, ``InceptionV3``, ``Xception`` or
    ``ResNet50``). When several names match, the first one in the order
    listed here wins.

    Returns:
        A ``(featurizer_name, classifier)`` tuple.

    Raises:
        ValueError: If the path names no known featurizer or no known
            classifier. The check happens before any model is loaded.
    """
    model_path = self.best_model_path

    # Pure string inspection first, so a malformed path fails fast instead
    # of surfacing as an UnboundLocalError after a model load.
    known_featurizers = ("VGG16", "VGG19", "InceptionV3", "Xception",
                         "ResNet50")
    featurizer_name = next(
        (name for name in known_featurizers if name in model_path), None)
    if featurizer_name is None:
        raise ValueError(
            "No known featurizer name in model path: %r" % (model_path,))

    if "LogisticRegression" in model_path:
        classifier = LogisticRegressionModel.load(model_path)
    elif "DecisionTree" in model_path:
        classifier = DecisionTreeClassificationModel.load(model_path)
    elif "RandomForest" in model_path:
        classifier = RandomForestClassificationModel.load(model_path)
    elif "LinearSVC" in model_path:
        classifier = LinearSVCModel.load(model_path)
    else:
        raise ValueError(
            "No known classifier name in model path: %r" % (model_path,))

    return featurizer_name, classifier
def handler(message):
    """Classify every (key, value) record in ``message`` and publish the result.

    Records are collected to the driver and handled one by one:

    * key longer than 10 characters: the value is used as an image path,
      the images are read with ``dl.readImages`` and scored with
      ``p_lr_test``; the first prediction minus one is published.
    * key of exactly 10 characters: the value is a comma-separated row of
      18 fields (17 integers followed by one field kept as text) scored
      with the tree model stored at ``hdfs:///treemodelofcsv``.

    Any other key length is only logged. Results go to ``output_topic``
    through ``producer``, keyed by the UTF-8 encoded record key.
    """
    column_names = ('Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1',
                    'Color2', 'Color3', 'MaturitySize', 'FurLength',
                    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
                    'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt')

    for record in message.collect():
        key, value = record[0], record[1]
        print('record', record, type(record))
        print('-----------')
        print('tuple', key, value, type(key), type(value))
        key_length = len(key)

        if key_length > 10:
            image_frame = dl.readImages(value)
            image_frame.show()
            scored_images = p_lr_test.transform(image_frame)
            predict_value = scored_images.select('prediction').head()[0] - 1
            encoded_prediction = str(predict_value).encode('utf-8')
            encoded_key = str(key).encode('utf-8')
            print('predict', predict_value)
            print('byte predict', encoded_prediction)
            print('byte key', encoded_key)
            producer.send(output_topic, key=encoded_key,
                          value=encoded_prediction)
            producer.flush()
            print('predict over')
        elif key_length == 10:
            print('entered csv model part')
            tree_model = DecisionTreeClassificationModel.load(
                "hdfs:///treemodelofcsv")
            row_factory = Row(*column_names)
            value_lst = str(value).split(',')
            print('value_lst', value_lst)
            print('lst_len', len(value_lst))
            # The first 17 fields are integers; the last one stays as text.
            numeric_fields = [int(value_lst[position])
                              for position in range(17)]
            input_row = row_factory(*numeric_fields, value_lst[17])
            input_frame = sql_sc.createDataFrame([input_row])
            input_frame.show()
            input_frame = pipeline.fit(input_frame).transform(input_frame)
            input_frame = feature.transform(input_frame)
            scored_row = tree_model.transform(input_frame)
            scored_row.show()
            predict_value = str(scored_row.select('prediction').head()[0])
            encoded_prediction = predict_value.encode('utf-8')
            print('predict value', encoded_prediction)
            producer.send(output_topic, key=str(key).encode('utf-8'),
                          value=encoded_prediction)
            producer.flush()
def DecisionTree(data):
    """Classify ``data`` with the saved tree model and label the first row.

    The ``prediction`` and ``probability`` columns are flattened into one
    list on the driver, so element 0 is the first row's prediction and
    element 1 its probability vector. The first prediction is printed.

    Returns:
        A tuple ``(label, percentage)`` where ``label`` is ``'FALSO'`` when
        the first prediction equals 1.0 and ``'VERDADERO'`` otherwise, and
        ``percentage`` is the first entry of the first row's probability
        vector multiplied by 100.
    """
    saved_model = DecisionTreeClassificationModel.load(
        'modelo_DecisionTree/modelDecisionTree')
    scored = saved_model.transform(data)
    flattened = (scored.select('prediction', 'probability')
                 .rdd.flatMap(lambda row: row)
                 .collect())

    first_prediction = flattened[0]
    print(first_prediction)
    label = 'FALSO' if first_prediction == 1.0 else 'VERDADERO'
    return label, flattened[1][0] * 100
def decision_tree_classifier():
    """Fit, inspect, save and reload a tiny decision tree end to end.

    A two-row frame is indexed with ``StringIndexer``, a depth-2 tree is
    fit on it and its debug string is printed. Two one-row probe frames are
    built (only the dense one is scored). Finally the estimator is saved to
    ``./dtc`` and the fitted model to ``./dtc_model``, and both are loaded
    back. Nothing is returned.
    """
    session = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    training_rows = [(1.0, Vectors.dense(1.0)),
                     (0.0, Vectors.sparse(1, [], []))]
    frame = session.createDataFrame(training_rows, ["label", "features"])

    label_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    indexed_frame = label_indexer.fit(frame).transform(frame)

    tree = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
    fitted = tree.fit(indexed_frame)
    # Values noted by the original author for this toy data:
    #   numNodes 3, depth 1, featureImportances SparseVector(1, {0: 1.0}),
    #   numFeatures 1, numClasses 2.
    # Debug string: "DecisionTreeClassificationModel (uid=...) of depth 1
    # with 3 nodes..."
    print(fitted.toDebugString)

    dense_probe = session.createDataFrame([(Vectors.dense(-1.0), )],
                                          ["features"])
    # Noted result: prediction 0.0, probability and rawPrediction both
    # DenseVector([1.0, 0.0]).
    first_row = fitted.transform(dense_probe).head()

    # Built but never scored; the noted prediction for it is 1.0.
    sparse_probe = session.createDataFrame(
        [(Vectors.sparse(1, [0], [1.0]), )], ["features"])

    base_dir = "."
    estimator_dir = base_dir + "/dtc"
    tree.save(estimator_dir)
    # Noted: the reloaded estimator reports getMaxDepth() == 2.
    reloaded_estimator = DecisionTreeClassifier.load(estimator_dir)

    model_dir = base_dir + "/dtc_model"
    fitted.save(model_dir)
    reloaded_model = DecisionTreeClassificationModel.load(model_dir)
def detect(log_entries_df,
           model_path='/shared/models/sentiment/overall/optimized',
           additional_cols=True,
           incl_idf=False,
           reduced_feature_set=True):
    """Label each log entry as ``anomaly`` or ``normal`` with a saved tree model.

    The entries are run through ``preprocess`` and ``vectorize`` and then
    scored. A prediction of 1.0 maps to ``"anomaly"``; anything else maps to
    ``"normal"``.

    Args:
        log_entries_df: Raw log entries to classify.
        model_path: Location of the saved decision-tree model.
        additional_cols: Forwarded to ``preprocess``; also selects the
            output shape (see Returns).
        incl_idf: Forwarded to ``vectorize``.
        reduced_feature_set: Forwarded to ``preprocess``.

    Returns:
        With ``additional_cols`` truthy: a frame with columns ``prediction``
        (the label), ``classifier`` (always ``'sentiment'``) and the
        configured key and value columns. Otherwise: a frame with a single
        ``value`` column holding a JSON-shaped string built by plain
        concatenation of the label and the ``text`` column.
    """
    classifier = DecisionTreeClassificationModel.load(model_path)
    tokens_df = preprocess(log_entries_df, additional_cols,
                           reduced_feature_set)
    scored_df = classifier.transform(vectorize(tokens_df, incl_idf))

    # Shared column expression: 1.0 -> "anomaly", everything else -> "normal".
    verdict = when(col("prediction") == 1.0,
                   lit("anomaly")).otherwise(lit("normal"))

    if not additional_cols:
        text_predictions = scored_df.select("text", "prediction")
        payload = concat(lit('{"prediction":"'), verdict,
                         lit('","text":"'), col('text'), lit('"}'))
        return text_predictions.select(payload.alias("value"))

    keyed_predictions = scored_df.select(
        "text", "prediction", cfg.key_col_name, cfg.value_col_name)
    keyed_predictions = keyed_predictions.withColumn('classifier',
                                                     lit('sentiment'))
    return keyed_predictions.select(
        verdict.alias("prediction"), 'classifier',
        cfg.key_col_name, cfg.value_col_name)
def prediction(self, infoData):
    """Predict on the stored sentiment dataset and join the result back.

    The parquet dataset at ``pc.SENTIMENTDATASETPATH`` is read, given an
    internal id, transformed with ``self.dataTransformation`` (plus n-grams
    when ``pc.ISNGRAM`` is truthy), scored with the model stored at
    ``pc.MODELSTORAGELOCATION`` and finally passed through
    ``self.invertIndex``. ``infoData`` is updated in place along the way.

    Args:
        infoData: Dict-like settings holder read with ``get`` and written
            with ``update``; keys are the ``pc`` constants used below.

    Returns:
        The original dataset (without any pre-existing prediction column)
        joined on ``pc.DMXINDEX`` with the new prediction column.

    Raises:
        ValueError: If ``pc.ALGORITHMNAME`` is not one of the supported
            algorithm names. Raised before any data is read.
    """
    # Truthiness covers the "missing / None means False" rule.
    isNgram = bool(infoData.get(pc.ISNGRAM))
    predictionColm = infoData.get(pc.PREDICTIONCOLM)
    algoName = infoData.get(pc.ALGORITHMNAME)
    modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
    spark = infoData.get(pc.SPARK)
    datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)

    # TODO(sahil): algorithm names are hard-coded for comparison; revisit
    # when finalising.
    # A plain lookup replaces `"...".__eq__(algoName)`: that call returns
    # the truthy NotImplemented for non-string names, which made every
    # branch fire and silently picked the last model class.
    modelClasses = {
        "GradientBoostClassifier": GBTClassificationModel,
        "DecisionTreeClassifier": DecisionTreeClassificationModel,
    }
    try:
        modelClass = modelClasses[algoName]
    except (KeyError, TypeError):
        raise ValueError(
            "Unsupported algorithm name: %r" % (algoName,)) from None

    originalDataset = spark.read.parquet(datasetPath)
    originalDataset = pu.addInternalId(originalDataset)
    infoData.update({pc.DATASET: originalDataset})
    infoData = self.dataTransformation(infoData)
    dataset = infoData.get(pc.DATASET)

    if isNgram:
        # TODO(sahil): handle a None n-gram parameter at data-creation time.
        textProcessing = TextProcessing()
        ngramPara = infoData.get(pc.NGRAMPARA)
        dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED, ngramPara)

    predictionModel = modelClass.load(modelStorageLocation)

    # Drop any stale prediction column so the model can write a fresh one.
    dataset = dataset.drop(predictionColm)
    originalDataset = originalDataset.drop(predictionColm)
    dataset = predictionModel.transform(dataset)

    # Run the index-to-string step after the prediction.
    infoData.update({pc.DATASET: dataset})
    infoData = self.invertIndex(infoData)
    dataset = infoData.get(pc.DATASET)

    dataset = dataset.select(pc.DMXINDEX, predictionColm)
    finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
    return finalDataset
def decision_tree_evaluator(test_data, deal_id):
    """Score the saved tree for ``deal_id`` and print its area under PR.

    The model must already be saved under the ``decision_trees`` S3 prefix
    for this deal.

    Args:
        test_data: Testing data set; its ``deal_id`` column is renamed to
            ``label`` before scoring.
        deal_id: Deal whose tree model is evaluated.

    Returns:
        The ``BinaryClassificationEvaluator`` that was used (not the
        metric value, which is only printed).
    """
    saved_tree = DecisionTreeClassificationModel.load(
        f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/")
    labelled = test_data.withColumnRenamed(deal_id, 'label')
    scored = saved_tree.transform(labelled)

    # NOTE(review): the metric is computed from the hard `prediction`
    # column, not from raw scores - confirm this is intended.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="prediction",
        metricName="areaUnderPR",
    )
    area_under_pr = evaluator.evaluate(scored)
    print("Decision Tree area under PR " + deal_id + " = " + str(area_under_pr))
    return evaluator
def get_metrics(deal_id, test_data=market_test):
    """Compute accuracy and area under PR for both saved models of a deal.

    Args:
        deal_id: Deal whose logistic-regression and decision-tree models are
            loaded from ``/mnt/lotte``.
        test_data: Testing data set; its ``deal_id`` column is renamed to
            ``label`` before scoring. The default is the ``market_test``
            object that existed when this function was defined.

    Returns:
        ``[lr_accuracy, lr_area, trees_accuracy, trees_area]``.
    """
    logistic_model = LogisticRegressionModel.load(
        f"/mnt/lotte/logistic_regression/{deal_id}/")
    trees_model = DecisionTreeClassificationModel.load(
        f"/mnt/lotte/decision_trees/{deal_id}/")

    # Score with each model, logistic regression first.
    scored_frames = [
        candidate.transform(test_data.withColumnRenamed(deal_id, 'label'))
        for candidate in (logistic_model, trees_model)
    ]

    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    area_evaluator = BinaryClassificationEvaluator(
        labelCol="label", rawPredictionCol="prediction",
        metricName="areaUnderPR")

    # Outer loop over models, inner over metrics, gives the documented order.
    return [
        evaluator.evaluate(scored)
        for scored in scored_frames
        for evaluator in (accuracy_evaluator, area_evaluator)
    ]
def apply_decision_tree_classifier(tweets):
    """Return ``tweets`` transformed by the stored sentiment decision tree.

    The model is loaded from a fixed HDFS location on every call.
    """
    models_root = 'hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/'
    model_name = 'Sentiment_Analysis_-_Decision_tree.0000'
    sentiment_tree = DecisionTreeClassificationModel.load(
        models_root + model_name)
    scored_tweets = sentiment_tree.transform(tweets)
    return scored_tweets
# Apply `rf_model1` to the training frame.
training2_1 = rf_model1.transform(training)
# Pair each prediction with the 'Survived' column as an RDD and pull it to
# the driver; the collected list is not stored.
PredictionsandLabels = training2_1.select('prediction','Survived').rdd
PredictionsandLabels.collect()

# 2 learning process - created a model
model22 = dt1.fit(training)
# Bare attribute reads: the values are only echoed in an interactive session.
model22.depth
model22.numFeatures
# Persist the fitted model, then read it back from the same path.
model22.save('E:/kaggle/model22')
# NOTE(review): load() is reached through an empty model instance here; it
# can normally be called on the class itself - confirm and simplify.
model120 = DecisionTreeClassificationModel()
model122 = model120.load('E:/kaggle/model22')
# Score the training frame with the reloaded model and show three rows.
training4 = model122.transform(training)
training4.show(3)

# Score again with the in-memory model; both names are bound to the same
# frame.
model23 = training2 = model22.transform(training)
PredictionsandLabels = training2.select('prediction','Survived').rdd
PredictionsandLabels.collect()

# --------------------------------------------------------------
#Resubstitution approach
[5, 4.91, 5.11], [0, 5.95, 8.17], [1, 6.41, 8.34], [4, 5.73, 5.93], [2, 6.39, 7.45], [3, 7.29, 6.5]]), columns=['Hour', 'true','simulation'])\ .astype({'Hour': 'int32'})\ .sort_values(by="Hour") # COMMAND ---------- ax = hour_prices[["true", "simulation"]].plot(use_index=False, xlim=(0, 23), ylim=(0, 10)) ax.legend() # COMMAND ---------- market_predictions\ .select("lr_max_bid","trees_max_bid","true_winning_max_bid")\ .describe()\ .show() # COMMAND ---------- deal_id = "e86f7061c" model_trees = DecisionTreeClassificationModel.load( f"/mnt/lotte/decision_trees/{deal_id}/") print(model_trees.toDebugString) #deal_id = "48f06d9af"
def load():
    """Create a local Spark session and load the tree model saved at ``tmp``.

    Both objects are discarded when the function ends; nothing is returned.
    """
    session = createLocalSparkSession()
    model_dir = 'tmp'
    loaded_model = DecisionTreeClassificationModel.load(model_dir)
def _get_root_node(tree: DecisionTreeClassificationModel):
    """Return the result of the JVM-side ``rootNode`` call on ``tree``."""
    java_member = 'rootNode'
    root_node = tree._call_java(java_member)
    return root_node
# (1) Import our Config file and our Pre-Processing and Feature
# Vectorisation pipeline functions
import config
import model_pipelines

# Module metadata. `__maintainer__` was previously misspelt `_maintainer__`
# (single leading underscore), so the conventional attribute was missing.
__author__ = "Jillur Quddus"
__credits__ = ["Jillur Quddus"]
__version__ = "1.0.0"
__maintainer__ = "Jillur Quddus"
__email__ = "*****@*****.**"
__status__ = "Development"

# (2) Create a Spark Session using the Spark Context instantiated from
# spark-submit
spark = SparkSession.builder.appName(
    "Stream Processing - Real-Time Sentiment Analysis").getOrCreate()

# (3) Load the Trained Decision Tree Classifier that we trained and
# persisted in Chapter 06
decision_tree_model = DecisionTreeClassificationModel.load(
    config.trained_classification_model_path)

# (4) Spark Structured Streaming does not yet support the automatic
# inference of JSON Kafka values into a Dataframe without a Schema.
# Therefore let us define the Schema explicitly; every field is read as a
# string.
schema = StructType([
    StructField("created_at", StringType()),
    StructField("id", StringType()),
    StructField("id_str", StringType()),
    StructField("text", StringType()),
    StructField("retweet_count", StringType()),
    StructField("favorite_count", StringType()),
    StructField("favorited", StringType()),
    StructField("retweeted", StringType()),
    StructField("lang", StringType()),
    StructField("location", StringType()),
])
# COMMAND ---------- #maxDepth=10, maxBins=128,maxMemoryInMB=2048,seed=1 for i in range(5): decision_tree_generator(market_train,deal_ids[i]) decision_tree_evaluator(market_test,deal_ids[i]) #worse results than the defaults, probably because of overfitting # COMMAND ---------- #Creates and saves all trees models for deal_id in deal_ids: market_predictions = market_test.select("features","market_guid") model = DecisionTreeClassificationModel.load(f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/") market_predictions = model.transform(market_predictions.withColumnRenamed(deal_id,'label')) market_predictions.groupBy("probability").count().show(100,False) # COMMAND ---------- print(deal_ids) # COMMAND ---------- #Prints the area under the Precision-Recall curve for every model for deal_id in deal_ids_list: print(logistic_regression_evaluator(test_data=market_test,deal_id=deal_id))
# Index the target column; each indexer is fit on the training frame.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
            for column in ["AdoptionSpeed"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
df_test = pipeline.fit(df_test).transform(df_test)

# Assemble the model inputs into a single "features" vector column.
feature = VectorAssembler(inputCols=input_cols, outputCol="features")
feature_vector = feature.transform(df)
feature_vector_test = feature.transform(df_test)

# Hold out 20% of the labelled data for evaluation (fixed seed).
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
testData.printSchema()

# Train the tree, persist it (overwriting any earlier copy) and read it
# back, so the evaluated model is exactly what is stored on HDFS.
lr = DecisionTreeClassifier(labelCol="AdoptionSpeed_index", featuresCol="features")
lrModel = lr.fit(trainingData)
lrModel.write().overwrite().save("hdfs:///treemodelofcsv")
modelloaded = DecisionTreeClassificationModel.load("hdfs:///treemodelofcsv")

lr_prediction = modelloaded.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed_index",
                                              predictionCol="prediction",
                                              metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of DecisionTreeModel is = %g" % (lr_accuracy))
print("Test Error of DecisionTreeModel = %g " % (1.0 - lr_accuracy))

# Score the unlabelled test set and collect ids and predictions in ONE
# action. Two separate collect() calls ran the job twice and relied on
# both returning rows in the same order to keep the columns aligned.
lr_prediction = modelloaded.transform(feature_vector_test)
_prediction_rows = lr_prediction.select('PetID', 'prediction').collect()
predictions = [int(elem['prediction']) for elem in _prediction_rows]
predictions_ids = [elem['PetID'] for elem in _prediction_rows]

# NOTE(review): the predictions are StringIndexer indices of AdoptionSpeed,
# not the original label values - confirm whether they need mapping back.
df_new = pd.DataFrame()
df_new['PetID'] = predictions_ids
df_new['AdoptionSpeed'] = predictions
from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() import numpy as np from flask import Flask, abort, jsonify, request from pyspark.ml.classification import DecisionTreeClassificationModel from pyspark.ml.linalg import Vectors model4 = DecisionTreeClassificationModel() model5 = model4.load('E:/kaggle/titanic/dt_model10') model5.depth model5.numFeatures app = Flask(__name__) @app.route('/api',methods=('GET','POST')) def make_predict(): print('hi, good morning ... ') data = request.get_json(force=True) print(data) predict_df = spark.createDataFrame([(1,Vectors.dense(data))],['index','Features']) predict_df.show() output = model5.transform(predict_df).select('prediction').first()[0] print(output) return jsonify('Survived' if output==1 else 'Not Survived') if __name__ == '__main__':
def load_model(self):
    """Return the saved tree model, training a new one when none is stored.

    If ``self.model_path`` exists the model is loaded from it; otherwise
    the result of ``self.train_model(self.load_data())`` is returned.
    """
    if not self.model_path.exists():
        training_data = self.load_data()
        return self.train_model(training_data)

    stored_location = str(self.model_path)
    return DecisionTreeClassificationModel.load(stored_location)
# Apply `rf_model1` to the training frame and display prediction next to
# the 'Survived' column.
training2_1 = rf_model1.transform(training)
training2_1.select('prediction','Survived').show()
# Same two columns as an RDD, pulled to the driver; the collected list is
# not stored.
PredictionsandLabels = training2_1.select('prediction','Survived').rdd
PredictionsandLabels.collect()

# Fit `dt1` on the training frame.
model22 = dt1.fit(training)
# Bare attribute reads: the values are only echoed in an interactive session.
model22.depth
model22.numFeatures
# Persist the fitted model, then read it back from the same path.
model22.save('/users/jyothsnap/Kaggle/titanic/model22')
# NOTE(review): load() is reached through an empty model instance here; it
# can normally be called on the class itself - confirm and simplify.
model120 = DecisionTreeClassificationModel()
model122 = model120.load('/users/jyothsnap/Kaggle/titanic/model22')
# Score the training frame with the reloaded model and show three rows.
training4 = model122.transform(training)
training4.show(3)

# Score again with the in-memory model; both names are bound to the same
# frame.
model23 = training2 = model22.transform(training)
PredictionsandLabels = training2.select('prediction','Survived').rdd
PredictionsandLabels.collect()

# --------------------------------------------------------------
#Resubstitution approach
# Please run this program from anaconda prompt (command line) # python "Program path and name" # python "e:\studyml-lab\Machine-Learning\Deployment\model deploy and run service.py" from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() # import numpy as np from flask import Flask, jsonify, request from pyspark.ml.classification import DecisionTreeClassificationModel from pyspark.ml.linalg import Vectors model40 = DecisionTreeClassificationModel() model141 = model40.load('E:/kaggle/model22') model141.depth model141.numFeatures app = Flask(__name__) @app.route('/api', methods=('GET', 'POST')) def make_predict(): print('hi, good morning ... ') data = request.get_json(force=True) print(data) predict_df = spark.createDataFrame([(1, Vectors.dense(data))], ['index', 'Features']) predict_df.show() output = model141.transform(predict_df).select('prediction').first()[0]
# Write the cleased dataset to an s3 bucket in parquet format dataset.write.parquet("s3://expedia-hotel-recommendations-workflow/spark_OutputCleasedDataset.parquet") # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = dataset.randomSplit([0.7, 0.3]) # Fit Decision Tree Algorithm dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features") dtcm = dtc.fit(trainingData) # Save trained Logistic Regression Model to s3 Bucket for future use dtcm.save('s3://expedia-hotel-recommendations-workflow/dtcm_model') # Load Pre-Trained Logistic Regression Model to illistrate how model will be imported for future use dtcModel = DecisionTreeClassificationModel.load("s3://expedia-hotel-recommendations-workflow/dtcm_model") # Make predictions with Decision Tree model on the Test Dataset dtcPredictions = dtcModel.transform(testData) # Calculate and print Accuracy score for Decision Tree Algorithm evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="accuracy") dtcAccuracy = evaluator.evaluate(dtcPredictions) print("Decision Tree accuracy Error = %g" % (dtcAccuracy)) # Calculate and print F1 score for Decision Tree Algorithm evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="f1") dtcF1 = evaluator.evaluate(dtcPredictions) print("Decision Tree f1 Error = %g" % (dtcF1))
#load data data = None if dataType == "libsvm": data = sqlContext.read.format("libsvm").load(dataPath) #load model if algoName == "LogisticRegression": from pyspark.ml.classification import LogisticRegressionModel model = LogisticRegressionModel.load(modelPath) elif algoName == "LinearRegression": from pyspark.ml.regression import LinearRegressionModel model = LinearRegressionModel.load(modelPath) elif algoName == "DecisionTreeClassification": from pyspark.ml.classification import DecisionTreeClassificationModel model = DecisionTreeClassificationModel.load(modelPath) elif algoName == "DecisionTreeRegression": from pyspark.ml.regression import DecisionTreeRegressionModel model = DecisionTreeRegressionModel.load(modelPath) elif algoName == "RandomForestClassification": from pyspark.ml.classification import RandomForestClassificationModel model = RandomForestClassificationModel.load(modelPath) elif algoName == "RandomForestRegression": from pyspark.ml.regression import RandomForestRegressionModel model = RandomForestRegressionModel.load(modelPath) elif algoName == "GBTClassification": from pyspark.ml.classification import GBTClassificationModel model = GBTClassificationModel.load(modelPath) elif algoName == "GBTRegression": from pyspark.ml.regression import GBTRegressionModel model = GBTRegressionModel.load(modelPath)
def _get_root_node(tree: DecisionTreeClassificationModel):
    """Return the JVM-side ``rootNode`` of ``tree``.

    When the object exposes a ``trees`` attribute, the first entry of that
    sequence is used instead of the object itself.
    """
    target = tree.trees[0] if hasattr(tree, 'trees') else tree
    return target._call_java('rootNode')