def entrenar_spam(sc, sql_context, dir_spam, dir_no_spam, num_trees=20, max_depth=8):
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)
    spam = sql_context.read.json(input_spam).select("text").withColumn(
        "label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn(
        "label", F.lit(0.0))
    training_data = spam.unionAll(no_spam)
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(training_data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                          numFeatures=140)
    featurizedData = hashingTF.transform(wordsData)
    # IDF weighting is currently disabled:
    # idf = IDF(inputCol="rawFeatures", outputCol="features")
    # idfModel = idf.fit(featurizedData)
    # rescaledData = idfModel.transform(featurizedData)
    seed = 1800009193
    (split_20_df, split_80_df) = featurizedData.randomSplit([20.0, 80.0], seed)
    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()
    rf = RandomForestClassifier().setLabelCol("label") \
        .setPredictionCol("predicted_label") \
        .setFeaturesCol("rawFeatures") \
        .setSeed(100088121) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)
    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])
    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="predicted_label", labelCol="label",
        metricName="accuracy")
    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    modelo = crossval.fit(training_set_df).bestModel
    predictions_and_labels_df = modelo.transform(test_set_df)
    accuracy = reg_eval.evaluate(predictions_and_labels_df)
    return modelo, accuracy
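# Usage sketch (illustrative, not from the original source): the input directories
# below are hypothetical placeholders for folders of JSON-encoded tweets;
# entrenar_spam returns the CrossValidator's best PipelineModel and its accuracy
# on the 20% hold-out split.
modelo, accuracy = entrenar_spam(sc, sql_context,
                                 "hdfs:///datos/spam",      # hypothetical path
                                 "hdfs:///datos/no_spam",   # hypothetical path
                                 num_trees=30, max_depth=10)
print("Accuracy on hold-out split:", accuracy)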
def transform(df: DataFrame, debug: bool = False) -> DataFrame:
    feature_cols = [c for c in df.columns if "feat_" in c]
    stages = get_stages(feature_cols)
    pipeline = Pipeline()
    pipeline.setStages(stages)
    if debug:
        print(pipeline.explainParams())
    return pipeline.fit(df).transform(df)
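# The transform() helper above relies on a get_stages() factory that is not shown
# here. A minimal sketch of what such a factory might return, assuming the
# "feat_*" columns are numeric (the stage choice and column names are guesses):
from pyspark.ml.feature import StandardScaler, VectorAssembler


def get_stages(feature_cols):
    # Assemble the feature columns into one vector, then standardize it.
    assembler = VectorAssembler(inputCols=feature_cols,
                                outputCol="assembled_features")
    scaler = StandardScaler(inputCol="assembled_features", outputCol="features")
    return [assembler, scaler]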
def onehot_encode(self, df, features):
    indexers = [
        StringIndexer(inputCol=f, outputCol=f + "_indexed") for f in features
    ]
    pipeline = Pipeline(stages=indexers)
    indexed = pipeline.fit(df).transform(df)
    oh_indexers = [
        OneHotEncoder(inputCol=f + "_indexed",
                      outputCol=f + "_vector").setDropLast(False)
        for f in features
    ]
    pipeline.setStages(oh_indexers)
    oh_indexed = pipeline.fit(indexed).transform(indexed)
    return oh_indexed
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.setPredictionCol("Prediction_cuisine")\
  .setLabelCol("6714")\
  .setFeaturesCol("features")\
  .setMaxBins(100)

# Create a Pipeline
dtPipeline = Pipeline()

# Set the stages of the Pipeline
dtPipeline.setStages([vectorizer, dt])

# Let's first train on the entire dataset to see what we get
dtModel = dtPipeline.fit(trainingSetDF)

# COMMAND ----------

resultsDtDf = dtModel.transform(testSetDF)
resultsDtDf.write.save('/mnt/data/resultsDtDf.parquet', format='parquet',
                       header=True, mode="overwrite")

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
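# A minimal sketch (assumptions: regression task, the "6714" label and
# "Prediction_cuisine" columns set above) of how the imported ParamGridBuilder and
# CrossValidator could be wired around dtPipeline:
from pyspark.ml.evaluation import RegressionEvaluator

dtEvaluator = RegressionEvaluator(labelCol="6714",
                                  predictionCol="Prediction_cuisine",
                                  metricName="rmse")
dtParamGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).build()
dtCrossval = CrossValidator(estimator=dtPipeline,
                            estimatorParamMaps=dtParamGrid,
                            evaluator=dtEvaluator,
                            numFolds=3)
bestDtModel = dtCrossval.fit(trainingSetDF).bestModel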
aft = DecisionTreeClassifier()
aft.setLabelCol("Readmitlabel")
aft.setMaxDepth(30)

print(aft.explainParams())

# COMMAND ----------

from pyspark.ml import Pipeline

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()

# Now we'll tell the pipeline to first create the feature vector, and then fit the classifier
lrPipeline.setStages([vecAssembler, aft])

# Pipelines are themselves Estimators -- so to use them we call fit:
lrPipelineModel = lrPipeline.fit(finaldf)

# COMMAND ----------

# DBTITLE 1,Using Model for data prediction
predictionsAndLabelsDF = lrPipelineModel.transform(finaldf)

predAnalysis = predictionsAndLabelsDF.select('age_05', 'age_15', 'age_25', 'age_35',
                                             'age_45', 'age_55', 'age_65', 'age_75',
                                             'age_85', 'age_MISS', 'DaysInHospital',
                                             'prediction')

confusionMatrix = predictionsAndLabelsDF.select('Readmitlabel', 'prediction')
                             outputCol="user_id", handleInvalid="skip")
titleIndexer = StringIndexer(inputCol="title", outputCol="item_id",
                             handleInvalid="skip")

als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id",
          ratingCol="points", coldStartStrategy="drop", nonnegative=True)

pipeline = Pipeline()
pipeline.setStages([userIndexer, titleIndexer, als])

model = pipeline.fit(train)

# COMMAND ----------

# DBTITLE 0,Predict
predictions = model.transform(test)
predictions.createOrReplaceTempView("predictions")

# COMMAND ----------

# MAGIC %md ## Recommendations

# COMMAND ----------
vectorizer.setOutputCol('features')

# splitting the dataset into training and test datasets in 80% - 20% ratio
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a Linear Regression Model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE').setLabelCol('Power_Output').setMaxIter(100).setRegParam(0.1)

# Create a ML Pipeline and set the stages
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

# Train the model with training dataset
lrModel = lrPipeline.fit(trainSetDF)

# Get the intercept and co-efficients of the equation
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Get list of column names except output
features = [col for col in trainSetDF.columns if col != "Power_Output"]
coefficents = zip(weights, features)

# sort the coefficients from greatest to least absolute weight
coefficents = sorted(coefficents, key=lambda tup: abs(tup[0]), reverse=True)
# extract from the pipeline before we can extract the model.summary information. We'll see this in the cell below.
# Second, pipelines can add complexity when we use them with multiple different data sets. We'll see this when
# we run against our holdout data two cells below. As you will see, we have to drop the prediction column before
# we run, which makes our pipeline inefficient.

# Spark ML will return the best model. To determine "best," we let Spark ML use its default measure for linear
# regression, which is Root Mean Squared Error (RMSE).

# We'll use MLflow to log the best model, so others can retrieve and use it.
os.environ['MLFLOW_TRACKING_URI'] = 'databricks'
os.environ['DATABRICKS_HOST'] = 'https://adb-8245268741408838.18.azuredatabricks.net'
os.environ['DATABRICKS_TOKEN'] = args.api_token

print(f"About to start mlflow with experiment_id {args.mlflow_experiment_id}")
with mlflow.start_run(experiment_id=args.mlflow_experiment_id) as run:
    print("mlflow run started")

    pipeline = Pipeline()
    pipeline.setStages([conditionIndexer, gradeIndexer, zipcodeIndexer, encoder,
                        assembler, tvs])
    pipelineModel = pipeline.fit(df_input_training_and_test)

    # Log the best model
    print("About to log model to mlflow...")
    mlflow.spark.log_model(pipelineModel, "house-price-pipelineModel")
    print("... model logged to mlflow")

    best_run = run.info
    print(f"Best run: {best_run}")

# Print some interesting data about the best model
# Check model accuracy and chosen parameters
# To get the model, you need to access the TrainValidationSplit object in the pipeline, which is the 6th param
# (see the constructor above)
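# A minimal sketch of the extraction described above (assuming the stage list passed
# to pipeline.setStages): the fitted TrainValidationSplit is the 6th stage of the
# PipelineModel, and its bestModel holds the tuned estimator.
tvsModel = pipelineModel.stages[5]      # TrainValidationSplitModel
bestModel = tvsModel.bestModel          # e.g. the winning LinearRegressionModel
print("Validation metric per param map:", tvsModel.validationMetrics)
print(bestModel.explainParams())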
sc = SparkContext(master='local', appName="HOME")

# Define the Spark DataFrame to use
df = spark.createDataFrame([('line_1', 100, 10, 1),
                            ('line_2', 200, 20, 2),
                            ('line_3', 300, 30, 2),
                            ('line_4', 300, 30, 3),
                            ('line_5', 200, 20, 1),
                            ('line_6', 100, 10, 1)],
                           ("label", "x1", "x2", "x3"))

# Define an assembler over columns 'x1', 'x2' and 'x3' that outputs 'features'
assembler = VectorAssembler(inputCols=["x1", "x2", "x3"], outputCol="features")

# Create the pipeline
pipelineResult = Pipeline()
# Define the stages that make up the pipeline
pipelineResult.setStages([assembler])

# Fit the pipeline model on the input data 'df'
modelResult = pipelineResult.fit(df)
# Transform the data using the fitted model
result_df = modelResult.transform(df)

# Define the k-means model
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(result_df)

# Get the sum of squared errors (SSE)
SSE = model.computeCost(result_df)
print("Sum of squared errors: " + str(SSE))
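# Note: KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0.
# A minimal equivalent sketch for newer versions, using the training cost from the
# model summary and the silhouette score from ClusteringEvaluator:
from pyspark.ml.evaluation import ClusteringEvaluator

predictions = model.transform(result_df)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")
print("Silhouette: " + str(evaluator.evaluate(predictions)))
print("Within-cluster SSE: " + str(model.summary.trainingCost))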
    return result


num_nodes = 2
# data_path = "/home/hadoop/MillionSongSubset/data/A/A/A"
data_path = '/mnt/snap/data'

# TODO: fix with nested dir
# filenames = [os.path.join(data_path, filename) for filename in os.listdir(data_path)]
filenames = getListOfFiles(data_path)

rdd = sc.parallelize(filenames, num_nodes)
rdd1 = rdd.flatMap(lambda x: read_h5_to_list(x))

# TODO: modified with attribute name
col_name = ["artist familiarity", "artist hotttnesss", "artist id",
            "artist location", "artist mbtags", "artist mbtags count",
            "artist name", "artist terms", "artist terms freq",
            "artist terms weight", "danceability", "duration",
            "end of fade in", "energy", "key", "key confidence", "loudness",
            "mode", "mode confidence", "release", "segments confidence",
            "segments loudness max", "segments loudness max time",
            "segments pitches", "segments timbre", "similar artists",
            "song hotttnesss", "song id", "start of fade out", "tempo",
            "time signature", "time signature confidence", "title",
            "track id", "year"]
df1 = rdd1.toDF(col_name)

vectorizer.setInputCols(col_name)
vectorizer.setOutputCol("features")

lrPipeline = Pipeline()
kmeans = KMeans().setK(2).setSeed(1)
lrPipeline.setStages([vectorizer, kmeans])
model = lrPipeline.fit(df1)
ModalDF = sqlContext.read.csv(TrainSource, header="True",
                              inferSchema="True").selectExpr("*", ConditionExpr)

# Transforms the input columns into a single vector column called 'features'
vectorizer = VectorAssembler()
vectorizer.setInputCols(["Lat", "Long", "Ele", "LocalTime"])
vectorizer.setOutputCol("features")

# Declaring objects for each regression
lr0 = LogisticRegression(labelCol="Condition", predictionCol="Predicted_Cond",
                         maxIter=100, regParam=0, family="multinomial")
lr1 = LinearRegression(labelCol="Temp", predictionCol="Predicted_Temp",
                       maxIter=100, regParam=0.1)
lr2 = LinearRegression(labelCol="Pres", predictionCol="Predicted_Pres",
                       maxIter=100, regParam=0.1)
lr3 = LinearRegression(labelCol="Humid", predictionCol="Predicted_Humid",
                       maxIter=100, regParam=0.1)

# Combining all the regressions in a pipeline and fitting the dataset to create a model
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr1, lr2, lr3, lr0])
lrModel = lrPipeline.fit(ModalDF)

# COMMAND ----------

"""
The following code takes the test dataset and performs these actions:
 - Gets geo information (Latitude, Longitude, Elevation)
 - Gets monthly data & timestamps for each record
 - Predicts the Temperature, Pressure, Humidity & Condition using the pipeline model
 - Changes the format and writes it to a file
"""

# Extract and transform TestRDD
TestRDD = sc.textFile(TestSource).map(
    lambda line: list(get_geo_info(line))).flatMap(
        lambda line: list(get_datetime_info(line)))
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkConf, SparkContext

sc = SparkContext(master='local', appName="HOME")

# Define the Spark DataFrame to use
df = spark.createDataFrame([
    ('line_1', 1, 2, 3, 4),
    ('line_2', 5, 6, 7, 8),
    ('line_3', 9, 9, 9, 9)
], ("label", "x1", "x2", "x3", "x4"))

# Define an assembler over columns 'x1' and 'x2' that outputs 'features1'
assembler12 = VectorAssembler(inputCols=["x1", "x2"], outputCol="features1")
# Create the pipeline
pipeline12 = Pipeline()
# Define the stages that make up the pipeline
pipeline12.setStages([assembler12])

# Define an assembler over columns 'x3' and 'x4' that outputs 'features2'
assembler34 = VectorAssembler(inputCols=["x3", "x4"], outputCol="features2")
# Create the pipeline
pipeline34 = Pipeline()
# Define the stages that make up the pipeline
pipeline34.setStages([assembler34])

# Define an assembler over columns 'features1' and 'features2' that outputs 'features'
assemblerResult = VectorAssembler(inputCols=["features1", "features2"],
                                  outputCol="features")
# Create the pipeline
pipelineResult = Pipeline()
# Define the stages that make up the pipeline (pipelines are valid stages themselves)
pipelineResult.setStages([pipeline12, pipeline34, assemblerResult])
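# A minimal usage sketch: because Pipeline objects are themselves pipeline stages,
# fitting pipelineResult fits pipeline12 and pipeline34 in turn before the final
# assembler merges 'features1' and 'features2' into 'features'.
modelResult = pipelineResult.fit(df)
result_df = modelResult.transform(df)
result_df.select("label", "features1", "features2", "features").show(truncate=False)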
def entrenar_juez(sc, sql_context, juez_spam, humanos, ciborgs, bots, dir_juez,
                  mongo_uri=None, num_trees=20, max_depth=8):
    logger.info("Training judge...")
    df_humanos = cargar_datos(sc, sql_context, humanos)
    df_bots = cargar_datos(sc, sql_context, bots)
    df_ciborgs = cargar_datos(sc, sql_context, ciborgs)
    tweets_humanos = df_para_tweets(df_humanos)
    tweets_bots = df_para_tweets(df_bots)
    tweets_ciborgs = df_para_tweets(df_ciborgs)
    tweets_df = tweets_humanos.union(tweets_bots).union(tweets_ciborgs)
    df_humanos = df_humanos.dropDuplicates(["user_id"])
    df_bots = df_bots.dropDuplicates(["user_id"])
    df_ciborgs = df_ciborgs.dropDuplicates(["user_id"])
    tweets = tweets_features(tweets_df, juez_spam)
    tweets.cache()
    usuarios_features_humanos = usuarios_features(df_humanos, 0.0)
    usuarios_features_ciborgs = usuarios_features(df_bots, 1.0)
    usuarios_features_bots = usuarios_features(df_ciborgs, 2.0)
    usuarios = usuarios_features_ciborgs.union(usuarios_features_bots).union(
        usuarios_features_humanos).cache()
    set_datos = usuarios.join(tweets, tweets.user_id == usuarios.user_id).drop(
        tweets.user_id).fillna(0).cache()
    seed = 1800009193
    (split_20_df, split_80_df) = set_datos.randomSplit([20.0, 80.0], seed)
    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()
    vectorizer = VectorAssembler()
    vectorizer.setInputCols([
        "ano_registro", "con_descripcion", "con_geo_activo",
        "con_imagen_default", "con_imagen_fondo", "con_perfil_verificado",
        "entropia", "followers_ratio", "n_favoritos", "n_listas", "n_tweets",
        "reputacion", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "0", "1", "2", "3", "4", "5", "6", "7", "8",
        "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
        "20", "21", "22", "23", "uso_mobil", "uso_terceros", "uso_web",
        "avg_diversidad_lex", "avg_long_tweets", "reply_ratio", "avg_hashtags",
        "mention_ratio", "avg_palabras", "avg_diversidad_palabras",
        "url_ratio", "avg_spam"
    ])
    vectorizer.setOutputCol("features")
    rf = RandomForestClassifier()
    rf.setLabelCol("categoria") \
        .setPredictionCol("Predicted_categoria") \
        .setFeaturesCol("features") \
        .setSeed(seed) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)
    rf_pipeline = Pipeline()
    rf_pipeline.setStages([vectorizer, rf])
    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="Predicted_categoria", labelCol="categoria",
        metricName="accuracy")
    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    logger.info("Searching for the best RandomForest model")
    rf_model = crossval.fit(training_set_df).bestModel
    logger.info("Saving the judge")
    guardar_juez(rf_model, dir_juez)
    logger.info("Saving the training set")
    training_set_df.write.json(dir_juez + "_trainingset", mode="overwrite")
    logger.info("Saving the training set to Mongo")
    if mongo_uri:
        training_set_df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)
    logger.info("Evaluating the test set")
    predictions_and_labels_df = rf_model.transform(test_set_df)
    predictions_and_labels_df.cache()
    accuracy = reg_eval.evaluate(predictions_and_labels_df)
    logger.info("Computing the confusion matrix")
    hh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    hb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    hc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    bh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    bb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    bc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    ch = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    cb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    cc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    return rf_model, accuracy, [[hh, hb, hc], [bh, bb, bc], [ch, cb, cc]]
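# Equivalent sketch (not part of entrenar_juez): inside the function, the same 3x3
# confusion matrix could be read off MulticlassMetrics instead of the nine
# filter/count passes above.
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import functions as F

pares = predictions_and_labels_df.select(
    F.col("Predicted_categoria").cast("double"),
    F.col("categoria").cast("double")).rdd.map(tuple)
print(MulticlassMetrics(pares).confusionMatrix().toArray())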
df_featured = df_featured.select(
    col("dem").alias("label"), col("ts"), col("id"),
    col("hour"), col("weekday"), col("pro_lag1"), col("pre_lag1"),
    col("pro"), col("pre")).filter(col("pro_lag1") > 0)

df_featured.printSchema()

training_seti = df_featured.select(col("pro_lag1"), col("pre_lag1"),
                                   col("hour"), col("ts"), col("label"))

vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()
lr.setPredictionCol("prediction")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

lrModel = lrPipeline.fit(training_seti)
predicted_df = lrModel.transform(training_seti)
# display(predicted_df)

test_seti = df_featured.select(col("pro").alias("pro_lag1"),
                               col("pre").alias("pre_lag1"),
                               col("hour"), col("ts"))

predicted_test_df = lrModel.transform(test_seti)
class SPARK_MODEL:
    # init model params
    def __init__(self, dataset, dataName, splitRatio, targetType, targetVariable,
                 split, nbSamples, goodClass, sparkModelsId, sparkLearningMethods,
                 sparkOptions, numClasses, extDataSet):
        self.dataset = dataset
        self.dataName = dataName
        self.splitRatio = splitRatio
        self.targetType = targetType
        self.targetVariable = targetVariable
        self.split = split
        self.nbSamples = nbSamples
        self.goodClass = goodClass
        self.sparkModelsId = sparkModelsId
        self.sparkLearningMethods = sparkLearningMethods
        self.sparkOptions = sparkOptions
        self.numClasses = numClasses
        self.extDataSet = extDataSet

    # rdd methods
    def _set_rdd(self, dataset):
        self._rdd = sc.textFile(dataset, 8)
        header = self._rdd.first()
        self._rdd = self._rdd.filter(lambda line: line != header)
        if self.targetType == 'classification':
            print("class")
            self._rdd = self._rdd.map(classParsePoint)
        else:
            self._rdd = self._rdd.map(regParsePoint)
        print(self._rdd.first())

    def _get_rdd(self):
        return self._rdd

    def _get_rddTest(self):
        return self._rddTest

    def _get_rddTraining(self):
        return self._rddTraining

    def _get_rddModel(self):
        return self._rddModel

    # model building: rdd
    def _set_rddModel(self, _type, _SLA, data):
        if _type == 'regression':
            if _SLA == 'randomForest':
                self._rddModel = RandomForest.trainRegressor(
                    data,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity='variance',
                    maxDepth=int(self.sparkOptions[1]),
                    maxBins=32)
            else:
                self._rddModel = ""
        else:  # classification
            if _SLA == 'randomForest':
                print(self.numClasses)
                self._rddModel = RandomForest.trainClassifier(
                    data,
                    numClasses=self.numClasses,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2])
            else:
                self._rddModel = ""

    def splitData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self._rdd.randomSplit(
                [1 - self.splitRatio, self.splitRatio])
        else:
            print("ExternalValidation")
            self._rddTraining = self._rdd
            self._rddTest = sc.textFile(self.extDataSet, 8)
            header = self._rddTest.first()
            self._rddTest = self._rddTest.filter(lambda line: line != header)
            if self.targetType == 'classification':
                self._rddTest = self._rddTest.map(classParsePoint)
            else:
                self._rddTest = self._rddTest.map(regParsePoint)

    # rdd/dataFrame method
    def rddToDataFrame(self, rdd):
        return rdd.toDF()

    def dataFrameToRdd(self, dataFrame):
        return dataFrame.rdd

    # dataFrame method
    def _set_dataFrame(self):
        self._dataFrame = sqlContext.read.format('csv').options(
            delimiter=';', header='true', inferschema='true',
            nullValue='').load(self.dataset)
        self._dataFrame = self._dataFrame.withColumn(
            self.targetVariable,
            self.dataFrame[self.targetVariable].cast("double"))

    def _get_dataFrame(self):
        return self._dataFrame

    def _get_dataFrameTest(self):
        return self._dataFrameTest

    def _get_dataFrameTraining(self):
        return self._dataFrameTraining

    def splitDataFrameData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self.dataFrameToRdd(
                self._get_dataFrame()).randomSplit(
                    [1 - self.splitRatio, self.splitRatio])
        else:
            self.splitData()
        self._dataFrameTest = self._rddTest.toDF()
        self._dataFrameTraining = self._rddTraining.toDF()

    def _get_dataFrameModel(self):
        return self._dataFrameModel

    def _get_pipeline(self):
        return self._pipeline

    def _get_crossval(self):
        return self._crossval

    def _get_paramGrid(self):
        return self._paramGrid

    def _get_regEval(self):
        return self._regEval

    # model building: dataframe
    def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):
        if _type == 'regression':
            if _SLA == 'randomForest':
                rf = RandomForestRegressor()
                rf.setLabelCol(self.targetVariable)\
                    .setPredictionCol("prediction")\
                    .setFeaturesCol("features")\
                    .setSeed(100088121)\
                    .setMaxDepth(int(self.sparkOptions[1]))\
                    .setMaxMemoryInMB(10000)\
                    .setFeatureSubsetStrategy(self.sparkOptions[5])
                self._regEval = RegressionEvaluator(
                    predictionCol="prediction",
                    labelCol=self.targetVariable,
                    metricName="rmse")
        else:  # classification
            if _SLA == 'randomForest':
                rf = RandomForestClassifier(
                    labelCol=self.targetVariable,
                    featuresCol="features",
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2],
                    probabilityCol="proba")
                if self.goodClass != '':
                    self._regEval = BinaryClassificationEvaluator(
                        labelCol=self.targetVariable,
                        metricName="areaUnderROC")
                else:
                    self._regEval = MulticlassClassificationEvaluator(
                        labelCol=self.targetVariable,
                        predictionCol="prediction",
                        metricName="accuracy")

        # Create a Pipeline
        self._pipeline = Pipeline()
        # Set the stages of the Pipeline
        self._pipeline.setStages([vecAssembler, rf])

        # GridSearch
        self._paramGrid = (ParamGridBuilder().addGrid(
            rf.numTrees,
            [int(num) for num in self.sparkOptions[4].split(',')]).build())

        # Add the grid to the CrossValidator
        self._crossval = CrossValidator(estimator=self._pipeline,
                                        estimatorParamMaps=self._paramGrid,
                                        evaluator=self._regEval,
                                        numFolds=self.nbSamples)

        # Now let's find and return the best model
        self._dataFrameModel = self._crossval.fit(data).bestModel

        # to be removed
        # print(rf.getNumTrees())
        # modelText = str(self._dataFrameModel.stages[-1])  # ._java_obj.toDebugString()
        # nbTrees = int(re.sub('.*?([0-9]*) trees$', r'\1', modelText))
        # print(nbTrees)
        # end TBR

        rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
                str(self.sparkModelsId[0]))
    # end function

    # model evaluation
    # classification
    def computeKappa(self, m):
        sum = np.sum(m)
        row = m.sum(axis=0)
        col = m.sum(axis=1)
        P0 = m.trace() / sum
        PE = np.sum((row[i] / sum) * (col[i] / sum) for i in range(m.shape[0]))
        return (P0 - PE) / (1 - PE)

    def computeBA(self, m):
        row = m.sum(axis=0)
        col = m.sum(axis=1)
        return np.sum(m[i][i] / col[i] for i in range(m.shape[0])) / m.shape[0]

    # rdd model evaluation
    def getRddPredictionsLabels(self, model, test_data):
        predictions = model.predict(test_data.map(lambda r: r.features))
        return predictions.zip(test_data.map(lambda r: r.label))

    def printRddMulticlassClassificationMetrics(self, predictions_and_labels):
        metrics = MulticlassMetrics(predictions_and_labels)
        print("KAPPA=" + str(
            self.computeKappa(np.array(metrics.confusionMatrix().toArray()))))
        print("BA=" + str(
            self.computeBA(np.array(metrics.confusionMatrix().toArray()))))
        CMarray = metrics.confusionMatrix().toArray()
        # CMstring = ','.join(['%.5f' % num for num in CMarray])
        print("CM=" + str(CMarray))

    def printRddBinaryClassificationMetrics(self, predictions_and_labels):
        metrics = BinaryClassificationMetrics(predictions_and_labels)
        print("KAPPA=" + str(
            self.computeKappa(np.array(metrics.confusionMatrix().toArray()))))
        print("BA=" + str(
            self.computeBA(np.array(metrics.confusionMatrix().toArray()))))
        CMarray = metrics.confusionMatrix().toArray()
        # CMstring = ','.join(['%.5f' % num for num in CMarray])
        print("CM=" + str(CMarray))

    def evaluateRddClassificationModel(self):
        predictions_and_labels = self.getRddPredictionsLabels(
            self._get_rddModel(), self._get_rddTest())
        if self.goodClass != '':  # binary classification
            # self.printRddBinaryClassificationMetrics(predictions_and_labels)
            self.printRddMulticlassClassificationMetrics(
                predictions_and_labels)
        else:
            self.printRddMulticlassClassificationMetrics(
                predictions_and_labels)

    def evaluateRddRegressionModel(self):
        # Get predictions
        valuesAndPreds = self.getRddPredictionsLabels(self._get_rddModel(),
                                                      self._get_rddTest())
        # Instantiate metrics object
        metrics = RegressionMetrics(valuesAndPreds)
        # Squared Error
        print("MSE = %s" % metrics.meanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)
        # R-squared
        print("R-squared = %s" % metrics.r2)
        # Mean absolute error
        print("MAE = %s" % metrics.meanAbsoluteError)
        # Explained variance
        print("Explained variance = %s" % metrics.explainedVariance)

    def evaluateDataFrameRegressionModel(self):
        # Now let's use rfModel to compute an evaluation metric for our test dataset: testSetDF
        predictionsAndLabelsDF = self._dataFrameModel.transform(
            self._dataFrameTest)
        # Run the previously created RMSE evaluator, regEval, on the predictionsAndLabelsDF DataFrame
        rmseRF = self._regEval.evaluate(predictionsAndLabelsDF)
        # Now let's compute the r2 evaluation metric for our test dataset
        r2RF = self._regEval.evaluate(predictionsAndLabelsDF,
                                      {self._regEval.metricName: "r2"})
        print("RMSE = %s" % rmseRF)
        print("R-squared = %s " % r2RF)

    def evaluateDataFrameClassificationModel(self, sc):
        # here we have a problem
        a = 1

    # save models
    def saveRddModel(self, sc):
        # save rdd API model
        remove_folder("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                      self.sparkModelsId[0])
        modelPath = "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + str(
            self.sparkModelsId[0])
        self._rddModel.save(sc, modelPath)

    def saveDataFrameModel(self):
        # final model to save
        # self._dataFrameModel = self._pipeline.fit(self._dataFrame)
        self._dataFrameModel = self._crossval.fit(self._dataFrame).bestModel
        modelText = str(self._dataFrameModel.stages[-1])  # ._java_obj.toDebugString()
        nbTrees = int(re.sub('.*?([0-9]*) trees$', r'\1', modelText))
        print(nbTrees)
        # save data frame API model
        remove_folder("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                      self.sparkModelsId[0])
        modelPath = "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + str(
            self.sparkModelsId[0])
        self._dataFrameModel.save(modelPath)
        self._pipeline.save(modelPath + "_Pipeline")

    def buildRDDModel(self, sparkContext):
        print("RDD_MODEL")
        # init RDD from dataset
        self._set_rdd(self.dataset)
        # split into test - training set
        self.splitData()
        # save rddTest and rddTraining into CSV and copy to PLP server!
        # self._rddTest.toDF().write.csv('/home/t752887/data/output/' + self.sparkModelsId[0] + '_' + self.dataName + '_test.csv')
        # self._rddTraining.toDF().write.csv('/home/t752887/data/output/' + self.sparkModelsId[0] + '_' + self.dataName + '_training.csv')
        self._rddTraining.toDF().toPandas().to_csv(
            '/home/t752887/data/output/' + self.sparkModelsId[0] + '_' +
            self.dataName + '_training.csv')
        self._rddTest.toDF().toPandas().to_csv(
            '/home/t752887/data/output/' + self.sparkModelsId[0] + '_' +
            self.dataName + '_test.csv')
        # lines = self._rddTest.map(toCSVLine)
        # lines.saveAsTextFile('/home/t752887/data/output/' + self.sparkModelsId[0] + '_' + self.dataName + '_test.csv')
        # lines = self._rddTraining.map(toCSVLine)
        # lines.saveAsTextFile('/home/t752887/data/output/' + self.sparkModelsId[0] + '_' + self.dataName + '_training.csv')

        # could become a loop of models
        if self.targetType == 'classification':
            self._set_rddModel('classification', 'randomForest',
                               self._get_rddTraining())
            self.evaluateRddClassificationModel()
            # final model to save
            self._set_rddModel('classification', 'randomForest',
                               self._get_rdd())
        # regression
        else:
            self._set_rddModel('regression', 'randomForest',
                               self._get_rddTraining())
            self.evaluateRddRegressionModel()
            # final model to save
            self._set_rddModel('regression', 'randomForest', self._get_rdd())

        # TODO: save the model
        self.saveRddModel(sparkContext)

    def buildDataFrameModel(self):
        # init dataframe from dataset
        self._set_dataFrame()
        # split into test - training set
        self.splitDataFrameData()
        # vector assembler
        ignore = [self.targetVariable]
        vecAssembler = VectorAssembler(inputCols=[
            x for x in self._dataFrameTraining.columns if x not in ignore
        ], outputCol="features")
        # dataFrame cross-validation Pipeline with model selection
        if self.targetType == 'regression':
            # build model on the data we pass
            self._set_dataFrameModel('regression', 'randomForest',
                                     self._get_dataFrameTraining(),
                                     vecAssembler)
            # evaluate best model
            self.evaluateDataFrameRegressionModel()
            # save the model
            self.saveDataFrameModel()
        else:
            # build model on the data we pass
            self._set_dataFrameModel('classification', 'randomForest',
                                     self._get_dataFrameTraining(),
                                     vecAssembler)
            # TODO evaluate best model
            self.evaluateDataFrameClassificationModel(sparkContext)
            # TODO save the model
            self.saveDataFrameModel()

    def performModelSelection(self):
        try:
            i = float(self.sparkOptions[4])
            return 0
        except (ValueError, TypeError):
            return 1

    dataFrame = property(_get_dataFrame, _set_dataFrame)
    dataFrameTest = property(_get_dataFrameTest)
    dataFrameTraining = property(_get_dataFrameTraining)
    dataFrameModel = property(_get_dataFrameModel, _set_dataFrameModel)
    pipeline = property(_get_pipeline)
    crossval = property(_get_crossval)
    paramGrid = property(_get_paramGrid)
    regEval = property(_get_regEval)
    rdd = property(_get_rdd, _set_rdd)
    rddTest = property(_get_rddTest)
    rddTraining = property(_get_rddTraining)
    rddModel = property(_get_rddModel, _set_rddModel)
    .setLabelCol("PE")\
    .setMaxIter(100)\
    .setRegParam(0.15)   # "PE" is the label column in the df

###########################
# create a pipeline
# - pipeline contains a series of stages in sequential execution
# - each stage is either an estimator or a transformer
# - pipeline.fit() may equal to one of the following:
#   * estimator.fit()
#   * transformer.transform()
# - the fitted model = pipelineModel
###########################
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])  # 2 stages in sequence: feature vector, then linear regression
lrModel = lrPipeline.fit(trainSetDF)    # return: pipelineModel

# get results from pipelineModel: must specify the stage, see below
# recall lrModel is a pipelineModel; only the 2nd stage is the model fitting procedure,
# which gives intercept and coefficients
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# only keep feature columns, i.e. exclude the label column
featuresNoLabel = [col for col in datasetDF.columns if col != "PE"]

# merge weights and labels
coefficients = list(zip(weights, featuresNoLabel))

# sort the coefficients from greatest absolute weight
coefficients.sort(key=lambda tup: abs(tup[0]), reverse=True)

######################
indexer = StringIndexer().setInputCol('ocean_proximity').setOutputCol('idx_ocean_proximity')
idxHousing = indexer.fit(renamedHousing).transform(renamedHousing)
idxHousing.show()

encoder = OneHotEncoder().setInputCol('idx_ocean_proximity').setOutputCol('one_hot_ocean_proximity')
ohHousing = encoder.fit(idxHousing).transform(idxHousing)
ohHousing.show()

# 4
numPipeline = [imputer, va, scaler]
catPipeline = [indexer, encoder]

pipeline = Pipeline(stages=numPipeline)
newHousing = pipeline.fit(renamedHousing).transform(renamedHousing)
newHousing = newHousing.drop('features')
newHousing.show()

pipeline = pipeline.setStages(catPipeline)
newHousing = pipeline.fit(newHousing).transform(newHousing)
newHousing.show()

va2 = VectorAssembler().setInputCols(['scaled_features', 'one_hot_ocean_proximity']).setOutputCol('features')
dataset = va2.transform(newHousing).select("features", "label")
# dataset.withColumnRenamed('final_features','features')
dataset.show(n=100, truncate=False)

# path_0 = "/Users/gaogao/PycharmProjects/pythonProject/feature.csv"
(trainingData, testData) = dataset.randomSplit([0.8, 0.2])

lr = LinearRegression()
lrModel = lr.fit(trainingData)
print("Coefficients: %s" % str(lrModel.coefficients))
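# A minimal evaluation sketch on the held-out 20% split (the RMSE metric choice is
# an assumption, not part of the original snippet):
from pyspark.ml.evaluation import RegressionEvaluator

testPredictions = lrModel.transform(testData)
rmseEvaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                    metricName="rmse")
print("Test RMSE: %s" % rmseEvaluator.evaluate(testPredictions))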
print("Labeled Messages: ", indexed.count())
# print ("OK Messages: ", indexed["label"]=="OK")
# print ("FRAUD Messages: ", indexed.count(indexed["label"]=="FRAUD"))
# print ("SPAM Messages: ", indexed.count(indexed["label"]=="SPAM"))
indexed.cube("label").count().orderBy("label").na.drop(
    subset=["label"]).show()
print("Indexed Schema: ", indexed.schema)
print("Labeled Data: ", indexed.count())
indexed.select("LabelIndex", "features").show()

TruePipeline = Pipeline()
TruePipeline.setStages([tokenizer, hashingTF, idf, indexer])
TrueFraudsModel = TruePipeline.fit(
    TrueLabeledMessages.na.drop(subset=["body"]))
TrueFraudsIndexed = TrueFraudsModel.transform(
    TrueLabeledMessages.na.drop(subset=["body"]))
print("TrueFrauds Indexed Schema: ", TrueFraudsIndexed.schema)
print("TrueFrauds Count: ", TrueFraudsIndexed.count())
TrueFraudsIndexed.select("LabelIndex", "features").show()

# Evaluation.NaiveBayesEvaluation(indexed)
print("Evaluation With true labeled fraud")
Evaluation.NaiveBayesEvaluation(TrueFraudsIndexed)

# 2 Classification with Subset
def linear_regression_to_predict_number_of_deaths():
    """
    Worldwide cancer mortality figures are considered for females aged 20-70 years old.
    > The mean death numbers by age are visualised initially.
    > As the data is curved, a polynomial regression is performed on it with the
      intention of predicting death numbers in the 50-54 age range
    """
    # Relevant columns for 20-70yrs are Deaths[10-19]
    column_index = [str(x) for x in range(10, 20)]
    sum_columns = (', '.join([f'sum(df.Deaths{x})' for x in column_index]))
    cancer_mortality_data.createOrReplaceTempView('df')
    female_yearly_totals = helper.spark.sql(
        f'select df.Year, {sum_columns} from df '
        'where df.Sex=2 '
        'group by df.Year '
        'order by df.Year asc').toDF('Year',
                                     *[f'Deaths{x}' for x in column_index])
    print(female_yearly_totals.show())

    # Check Pearson Correlation Coefficient between independent variables and target variable (Deaths16)
    for i in female_yearly_totals.columns:
        if i != 'Year':
            correlation = female_yearly_totals.stat.corr('Deaths16', i)
            print(f'Correlation between {i} and Deaths16: {correlation}')

    # plot mean deaths by age to find out the shape of this dataset
    plot_df = female_yearly_totals.toPandas()
    plot_df.drop(columns=['Year'],
                 inplace=True)  # drop the Year column as it's not needed for this graph
    x_axis = [x for x in range(0, len(plot_df.columns))]
    y_axis = [int(plot_df[y].mean()) for y in plot_df.columns]

    # get age range labels
    age_ranges = [
        helper.age_ranges.select('00').filter(
            (helper.age_ranges['index'] == str(x))).collect()[0][0]
        for x in column_index
    ]

    fig = plt.figure(figsize=(9, 6))  # set plotted figure size
    plt.xticks(np.arange(len(age_ranges)), age_ranges)
    plt.ylabel('Yearly Average Deaths')
    plt.xlabel('Age')
    plt.title('Cancer Deaths (Female, 20-70 years old)')
    plt.plot(x_axis, y_axis)
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    plt.savefig(f'{output_dir}/female_cancer_deaths.png')
    plt.clf()

    # split into training and test sets
    (training, test) = female_yearly_totals.randomSplit([.7, .3])
    training.cache()
    test.cache()

    # exclude 'Deaths16' (50-54 years old range) from features in training
    column_index.remove('16')
    vectorised = VectorAssembler(
        inputCols=[f'Deaths{x}' for x in column_index], outputCol='features')
    poly_expansion = PolynomialExpansion(degree=3, inputCol='features',
                                         outputCol='poly_features')

    # set label column to 'Deaths16' as this is the column value being predicted,
    # and regress on the expanded polynomial features
    lr = LinearRegression(maxIter=10, regParam=0.5) \
        .setLabelCol('Deaths16') \
        .setPredictionCol('predicted') \
        .setFeaturesCol('poly_features')

    lr_pipeline = Pipeline()
    lr_pipeline.setStages([vectorised, poly_expansion, lr])

    # Fit the model
    model = lr_pipeline.fit(training)

    # predict using test data
    predictions = model.transform(test).select('Year', 'Deaths16',
                                               'poly_features', 'predicted')
    print(predictions.show())

    model_details = model.stages[2]
    print('_____________\nModel details:\n_____________')
    # Print the coefficients and intercept for generalized linear regression model
    print('Coefficients: ' + str(model_details.coefficients))
    print('Intercept: ' + str(model_details.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model_details.summary
    print('Coefficient Standard Errors: ' +
          str(summary.coefficientStandardErrors))
    print('T Values: ' + str(summary.tValues))
    print('P Values: ' + str(summary.pValues))
    print('r^2: ' + str(summary.r2))
    print('Mean Squared Error: ' + str(summary.meanSquaredError))
    print('Mean Absolute Error: ' + str(summary.meanAbsoluteError))
    print('Explained variance: ' + str(summary.explainedVariance))
    print('Degrees Of Freedom: ' + str(summary.degreesOfFreedom))
    print('Deviance Residuals: ' + str(summary.devianceResiduals))

    # Evaluation metrics for test dataset
    # Create an RMSE evaluator using the label and predicted columns
    reg_eval = RegressionEvaluator(predictionCol='predicted',
                                   labelCol='Deaths16', metricName='rmse')

    # Run the evaluator on the DataFrame
    print('_____________\nPrediction evaluation:\n_____________')
    rmse = reg_eval.evaluate(predictions)
    print(f'Root Mean Squared Error: {rmse}')

    # Mean Square Error
    mse = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mse'})
    print(f'Mean Square Error: {mse}')

    # Mean Absolute Error
    mae = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mae'})
    print(f'Mean Absolute Error: {mae}')

    # r2 - coefficient of determination
    r2 = reg_eval.evaluate(predictions, {reg_eval.metricName: 'r2'})
    print(f'r^2: {r2}')