from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.feature import VectorAssembler


def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into a feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Hierarchical k-means
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()
    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)
    return rv
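
# A minimal sketch of calling getTopClusters; the date/time bounds and the
# "theft" category are hypothetical, as is the assumption that applyFilter
# returns a DataFrame with numeric X and Y columns.
topClusters = getTopClusters("2016-01-01", "2016-12-31", "00:00", "23:59", "theft")
for cluster in topClusters:
    # "c" is the (X, Y) cluster center, "o" is the number of points assigned to it
    print(cluster["c"], cluster["o"])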
import pyspark
import pyspark.sql.functions as f
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.regression import LinearRegression


def run(spark_session: pyspark.sql.session.SparkSession):
    bike_sharing = spark_session.read.csv("day.csv", header=True)

    # Cast the raw string columns to numeric types; "cnt" becomes the label
    bike_sharing01 = bike_sharing.select(
        bike_sharing.season.astype("int"),
        bike_sharing.yr.astype("int"),
        bike_sharing.mnth.astype("int"),
        bike_sharing.holiday.astype("int"),
        bike_sharing.weekday.astype("int"),
        bike_sharing.workingday.astype("int"),
        bike_sharing.weathersit.astype("int"),
        bike_sharing.temp.astype("double"),
        bike_sharing.atemp.astype("double"),
        bike_sharing.hum.astype("double"),
        bike_sharing.windspeed.astype("double"),
        bike_sharing.cnt.astype("int").alias("label"))

    assembler = VectorAssembler()
    assembler.setInputCols(bike_sharing01.columns[:-1])
    assembler.setOutputCol("features")

    train, test = bike_sharing01.randomSplit((0.7, 0.3))
    train01 = assembler.transform(train)
    train02 = train01.select("features", "label")

    lr = LinearRegression()
    model = lr.fit(train02)

    test2 = assembler.transform(test)
    test02 = test2.select("features", "label")
    out = model.transform(test02)

    e = RegressionEvaluator()
    r2 = e.evaluate(out, {e.metricName: "r2"})
    rmse = e.evaluate(out, {e.metricName: "rmse"})

    # Flag predictions within 300 rentals of the true count as "accurate"
    res = out.select(f.abs(f.col("label") - f.col("prediction")).alias("diff"))
    accs = res.select(f.when(f.col("diff") < 300, 1).otherwise(0).alias("is_accurate"))
    accs.limit(3).toPandas()
    accs.agg(f.mean("is_accurate").alias("accuracy")).toPandas()

    # Using MinMaxScaler to scale features:
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    scaler_model = scaler.fit(train02)
    scaler_model.transform(train02)
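
# A minimal sketch of wiring the scaler into a Pipeline so the test set is
# scaled with statistics learned on the training set; this continuation of
# run() is an assumption, not part of the original snippet (train02 and
# test02 are the DataFrames built inside run()).
from pyspark.ml import Pipeline

scaled_pipeline = Pipeline(stages=[
    MinMaxScaler(inputCol="features", outputCol="scaled_features"),
    LinearRegression(featuresCol="scaled_features"),
])
scaled_model = scaled_pipeline.fit(train02)   # fits scaler and regression together
scaled_out = scaled_model.transform(test02)   # test set reuses training statistics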
df.select("name").show() df.select(df['name'], df['age']+1).show() df.filter(df['age']>21).show() df.groupBy("age").count().show() #count usually follows groupBy ######################### # feature construction # ######################### #feature construction from pyspark.ml.feature import VectorAssembler #Get the DF to allow colum-wise operation datasetDF = sqlContext.table("plant_dataset") vectorizer = VectorAssembler() vectorizer.setInputCols(["AT", "V", "AP", "RH"]) #combine several columns tgt to form a feature vector vectorizer.setOutputCol("features") # name of output column #splitting performed on df; split on df, not on table! (split15DF, split85DF) = datasetDF.randomSplit([0.15, 0.85], seed=1900009193L) #cache the splitted datasets testSetDF = split85DF.cache() trainSetDF = split15DF.cache() ########################### # model building in spark # ########################### from pyspark.ml.regression import LinearRegression from pyspark.ml.regression import LinearRegressionModel from pyspark.ml import Pipeline
import datetime

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, date_format, hour, lag, udf, unix_timestamp
from pyspark.sql.types import DateType
from pyspark.sql.window import Window

# df: input DataFrame from earlier context, with columns ts, dem, pro, pre
func = udf(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'), DateType())
df = df.select('*', unix_timestamp(func(col('ts'))).alias("id"))

# Note: a window without partitionBy() moves all rows into a single partition
w = Window().partitionBy().orderBy(col("id"))

df_featured = df.select(
    "*",
    hour(col("ts")).alias("hour"),
    date_format(col("ts"), 'EEEE').alias("weekday"),
    lag("pro").over(w).alias("pro_lag1"),
    lag("pre").over(w).alias("pre_lag1"))
df_featured = df_featured.select(
    col("dem").alias("label"),
    col("ts"), col("id"),
    col("hour"), col("weekday"), col("pro_lag1"), col("pre_lag1"),
    col("pro"), col("pre")).filter(col("pro_lag1") > 0)
df_featured.printSchema()

training_seti = df_featured.select(
    col("pro_lag1"), col("pre_lag1"), col("hour"), col("ts"), col("label"))

vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()
lr.setPredictionCol("prediction") \
  .setMaxIter(100) \
  .setRegParam(0.1)

# We will use the spark.ml pipeline API. If you have worked with scikit-learn,
# this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
lrModel = lrPipeline.fit(training_seti)
predicted_df = lrModel.transform(training_seti)
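
# A minimal sketch of checking the in-sample fit of the model above with
# RegressionEvaluator; the evaluator wiring is an assumption, not part of
# the original snippet.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predicted_df)
print("Training RMSE: %.3f" % rmse)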
"dsfs_ave", "dsfs_stdev", "CharlsonIndexI_max", "CharlsonIndexI_min", "CharlsonIndexI_ave", "CharlsonIndexI_range", "CharlsonIndexI_stdev", "pcg1", "pcg2", "pcg3", "pcg4", "pcg5", "pcg6", "pcg7", "pcg8", "pcg9", "pcg10", "pcg11", "pcg12", "pcg13", "pcg14", "pcg15", "pcg16", "pcg17", "pcg18", "pcg19", "pcg20", "pcg21", "pcg22", "pcg23", "pcg24", "pcg25", "pcg26", "pcg27", "pcg28", "pcg29", "pcg30", "pcg31", "pcg32", "pcg33", "pcg34", "pcg35", "pcg36", "pcg37", "pcg38", "pcg39", "pcg40", "pcg41", "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4", "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1", "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11", "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2", "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max", "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max", "labCount_min", "labCount_ave", "labcount_months" ]) vecAssembler.setOutputCol("features") print vecAssembler.explainParams() from pyspark.ml.classification import DecisionTreeClassifier aft = DecisionTreeClassifier() aft.setLabelCol("Readmitlabel") aft.setMaxDepth(30) print aft.explainParams() # COMMAND ---------- from pyspark.ml import Pipeline # We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import DoubleType, StructField, StructType

power_data_schema = StructType([
    StructField('Atmospheric_Temperature', DoubleType(), True),
    StructField('Vacuum_Speed', DoubleType(), True),
    StructField('Atmospheric_Pressure', DoubleType(), True),
    StructField('Relative_Humidity', DoubleType(), True),
    StructField('Power_Output', DoubleType(), True)
])

raw_data_df = spark.read.format("csv").option("delimiter", "\t").option("header", "true") \
    .load("/home/ragesh/Data/Power_Plant_Data/power_plant_data", schema=power_data_schema)
# raw_data_df.show(10, truncate=False)

# Converts the list of columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(['Atmospheric_Temperature', 'Vacuum_Speed',
                         'Atmospheric_Pressure', 'Relative_Humidity'])
vectorizer.setOutputCol('features')

# Split the dataset into training and test sets in an 80% - 20% ratio
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a linear regression model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE').setLabelCol('Power_Output').setMaxIter(100).setRegParam(0.1)

# Create an ML Pipeline and set the stages
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
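
# A minimal sketch of fitting the pipeline above and scoring the held-out
# set; the RegressionEvaluator wiring is an assumption, not part of the
# original snippet.
from pyspark.ml.evaluation import RegressionEvaluator

lrModel = lrPipeline.fit(trainSetDF)
resultsDF = lrModel.transform(testSetDF)
rmse = RegressionEvaluator(predictionCol='Predicted_PE',
                           labelCol='Power_Output',
                           metricName='rmse').evaluate(resultsDF)
print('Test RMSE: %.3f' % rmse)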
# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler, QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer().setInputCol("FATALITY_TYPE").setOutputCol("label").fit(eventFatalitiesDf)
featureDiscretizer = QuantileDiscretizer(numBuckets=10, inputCol="FATALITY_AGE",
                                         outputCol="FAT_AGE_BIN").fit(eventFatalitiesDf)
featureIndexers = [StringIndexer().setInputCol(baseFeature).setOutputCol(baseFeature + "_IDX").fit(eventFatalitiesDf)
                   for baseFeature in ["FATALITY_SEX", "FATALITY_LOCATION"]]

featureAssembler = VectorAssembler()
featureAssembler.setInputCols(["FATALITY_SEX_IDX", "FATALITY_LOCATION_IDX", "FAT_AGE_BIN"])
featureAssembler.setOutputCol("features")

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
gbtClassifier = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

pipelineStages = [labelIndexer, featureDiscretizer] + featureIndexers + \
    [featureAssembler, gbtClassifier, labelConverter]
pipeline = Pipeline(stages=pipelineStages)

(trainingData, testData) = eventFatalitiesDf.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
display(predictions.select("predictedLabel", "label", "features"))
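
# A minimal sketch putting the imported (but so far unused)
# MulticlassClassificationEvaluator to work on the predictions above; the
# metric choice is an assumption, not part of the original snippet.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy: %.3f" % accuracy)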
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def entrenar_juez(sc, sql_context, juez_spam, humanos, ciborgs, bots, dir_juez,
                  mongo_uri=None, num_trees=20, max_depth=8):
    logger.info("Training judge...")
    df_humanos = cargar_datos(sc, sql_context, humanos)
    df_bots = cargar_datos(sc, sql_context, bots)
    df_ciborgs = cargar_datos(sc, sql_context, ciborgs)
    tweets_humanos = df_para_tweets(df_humanos)
    tweets_bots = df_para_tweets(df_bots)
    tweets_ciborgs = df_para_tweets(df_ciborgs)
    tweets_df = tweets_humanos.union(tweets_bots).union(tweets_ciborgs)
    df_humanos = df_humanos.dropDuplicates(["user_id"])
    df_bots = df_bots.dropDuplicates(["user_id"])
    df_ciborgs = df_ciborgs.dropDuplicates(["user_id"])
    tweets = tweets_features(tweets_df, juez_spam)
    tweets.cache()

    # Label the user-level features: 0.0 human, 1.0 cyborg, 2.0 bot
    usuarios_features_humanos = usuarios_features(df_humanos, 0.0)
    usuarios_features_ciborgs = usuarios_features(df_bots, 1.0)
    usuarios_features_bots = usuarios_features(df_ciborgs, 2.0)
    usuarios = usuarios_features_ciborgs.union(usuarios_features_bots).union(
        usuarios_features_humanos).cache()
    set_datos = usuarios.join(tweets, tweets.user_id == usuarios.user_id).drop(
        tweets.user_id).fillna(0).cache()

    seed = 1800009193
    (split_20_df, split_80_df) = set_datos.randomSplit([20.0, 80.0], seed)
    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    vectorizer = VectorAssembler()
    vectorizer.setInputCols([
        "ano_registro", "con_descripcion", "con_geo_activo", "con_imagen_default",
        "con_imagen_fondo", "con_perfil_verificado", "entropia", "followers_ratio",
        "n_favoritos", "n_listas", "n_tweets", "reputacion",
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
        "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23",
        "uso_mobil", "uso_terceros", "uso_web", "avg_diversidad_lex",
        "avg_long_tweets", "reply_ratio", "avg_hashtags", "mention_ratio",
        "avg_palabras", "avg_diversidad_palabras", "url_ratio", "avg_spam"
    ])
    vectorizer.setOutputCol("features")

    rf = RandomForestClassifier()
    rf.setLabelCol("categoria") \
        .setPredictionCol("Predicted_categoria") \
        .setFeaturesCol("features") \
        .setSeed(seed) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([vectorizer, rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="Predicted_categoria", labelCol="categoria",
        metricName="accuracy")
    crossval = CrossValidator(estimator=rf_pipeline, evaluator=reg_eval, numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)

    logger.info("Searching for the best RandomForest model")
    rf_model = crossval.fit(training_set_df).bestModel

    logger.info("Saving the judge")
    guardar_juez(rf_model, dir_juez)
    logger.info("Saving the training set")
    training_set_df.write.json(dir_juez + "_trainingset", mode="overwrite")
    logger.info("Saving the training set to Mongo")
    if mongo_uri:
        training_set_df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)

    logger.info("Evaluating the test set")
    predictions_and_labels_df = rf_model.transform(test_set_df)
    predictions_and_labels_df.cache()
    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    logger.info("Computing the confusion matrix")
    hh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    hb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    hc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    bh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    bb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    bc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    ch = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 0)].count()
    cb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 1)].count()
    cc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2) &
        (predictions_and_labels_df.Predicted_categoria == 2)].count()
    return rf_model, accuracy, [[hh, hb, hc], [bh, bb, bc], [ch, cb, cc]]
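
# A minimal sketch of a more compact way to obtain the same confusion counts
# in a single Spark job instead of nine separate count() actions; this is an
# alternative to the block above, not part of the original function.
conf_counts = (predictions_and_labels_df
               .groupBy("categoria", "Predicted_categoria")
               .count()
               .collect())
matrix = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
for row in conf_counts:
    matrix[int(row["categoria"])][int(row["Predicted_categoria"])] = row["count"]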
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml.feature import VectorAssembler

x_y_RH = sqlContext.sql("SELECT RH as RH, PE as PE from power_plant")
x_y_DF_RH = pd.DataFrame(x_y_RH.toPandas().sample(n=1000), columns=['RH', 'PE'])
x_y_DF_RH.plot(kind='scatter', x='RH', y='PE', color='yellow')
plt.show()

## Preparing the data for machine learning
# We will use VectorAssembler()
datasetDF = sqlContext.table('power_plant')
vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])
vectorizer.setOutputCol("features")
# The target variable is the power output (PE)

# Split the dataset into training (80%) and test (20%) sets
seed = 1800009193
(split20DF, split80DF) = datasetDF.randomSplit([0.2, 0.8], seed)
trainingSetDF = split80DF
testSetDF = split20DF

# Cache the data to speed up later computations
trainingSetDF.cache()
testSetDF.cache()

# Decision trees
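
# A minimal sketch of how the decision-tree section presumably continues:
# a DecisionTreeRegressor on the same assembled features. The stage wiring
# and parameter values are assumptions, not part of the original snippet.
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(labelCol="PE", predictionCol="Predicted_PE", maxDepth=5)
dtPipeline = Pipeline(stages=[vectorizer, dt])
dtModel = dtPipeline.fit(trainingSetDF)
dtResultsDF = dtModel.transform(testSetDF)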