Example #1
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import BisectingKMeans


def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Bisecting (hierarchical) k-means with k=10 clusters
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()

    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)

    return rv
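
# Hypothetical usage sketch (not part of the original snippet; the date/time/category
# values and the applyFilter() helper are assumed to exist in the surrounding app):
topClusters = getTopClusters("2017-01-01", "2017-12-31", "00:00", "23:59", "THEFT")
for cluster in topClusters:
    print("center:", cluster["c"], "size:", cluster["o"])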
Example #2
import pyspark
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f


def run(spark_session: pyspark.sql.session.SparkSession):
    bike_sharing = spark_session.read.csv("day.csv", header=True)

    bike_sharing01 = bike_sharing.select(
        bike_sharing.season.astype("int"),
        bike_sharing.yr.astype("int"),
        bike_sharing.mnth.astype("int"),
        bike_sharing.holiday.astype("int"),
        bike_sharing.weekday.astype("int"),
        bike_sharing.workingday.astype("int"),
        bike_sharing.weathersit.astype("int"),
        bike_sharing.temp.astype("double"),
        bike_sharing.atemp.astype("double"),
        bike_sharing.hum.astype("double"),
        bike_sharing.windspeed.astype("double"),
        bike_sharing.cnt.astype("int").alias("label")
    )

    assembler = VectorAssembler()
    assembler.setInputCols(bike_sharing01.columns[:-1])
    assembler.setOutputCol("features")
    train, test = bike_sharing01.randomSplit((0.7, 0.3))

    train01 = assembler.transform(train)

    train02 = train01.select("features", "label")

    lr = LinearRegression()

    model = lr.fit(train02)

    test2 = assembler.transform(test)
    test02 = test2.select("features", "label")
    out = model.transform(test02)

    e = RegressionEvaluator()
    e.evaluate(out, {e.metricName: "r2"})
    e.evaluate(out, {e.metricName: "rmse"})

    res = out.select(f.abs(f.col("label")-f.col("prediction")).alias("diff"))
    accs = res.select(f.when(f.col("diff") < 300, 1).otherwise(0).alias("is_accurate"))
    accs.limit(3).toPandas()
    accs.agg(f.mean("is_accurate").alias("accuracy")).toPandas()

    # using MinMaxScaler to scale features:
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    scaler_model = scaler.fit(train02)
    scaler_model.transform(train02)
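
    # Follow-up sketch (assumed, not in the original): to make use of the scaled features,
    # retrain the regression on "scaled_features" and evaluate it the same way.
    train03 = scaler_model.transform(train02)
    test03 = scaler_model.transform(test02)
    lr_scaled = LinearRegression(featuresCol="scaled_features")
    model_scaled = lr_scaled.fit(train03)
    out_scaled = model_scaled.transform(test03)
    e.evaluate(out_scaled, {e.metricName: "rmse"})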
Example #3
df.select("name").show()
df.select(df['name'], df['age']+1).show()
df.filter(df['age']>21).show()
df.groupBy("age").count().show()	#count usually follows groupBy

#########################
# feature construction  #
#########################

#feature construction
from pyspark.ml.feature import VectorAssembler
# Get the DataFrame to allow column-wise operations
datasetDF = sqlContext.table("plant_dataset")
vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])	#combine several columns tgt to form a feature vector 
vectorizer.setOutputCol("features")		# name of output column


# Splitting is performed on the DataFrame, not on the table!
(split15DF, split85DF) = datasetDF.randomSplit([0.15, 0.85], seed=1900009193)
# Cache the split datasets
testSetDF = split85DF.cache()
trainSetDF = split15DF.cache()

###########################
# model building in spark #
###########################
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline
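
# A minimal sketch of the model-building step these imports prepare for, assuming the
# plant_dataset table also contains the target column PE (power output):
lr = LinearRegression()
lr.setLabelCol("PE") \
  .setPredictionCol("Predicted_PE") \
  .setMaxIter(100) \
  .setRegParam(0.1)

lrPipeline = Pipeline(stages=[vectorizer, lr])
lrModel = lrPipeline.fit(trainSetDF)
predictionsDF = lrModel.transform(testSetDF)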
Example #4
import datetime

from pyspark.sql import Window
from pyspark.sql.functions import udf, col, unix_timestamp, hour, date_format, lag
from pyspark.sql.types import TimestampType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Parse the timestamp string; TimestampType (rather than DateType) keeps the time of day,
# so the derived "id" below is unique per row and the window ordering is stable
func = udf(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'), TimestampType())
df = df.select('*', unix_timestamp(func(col('ts'))).alias("id"))

w = Window().partitionBy().orderBy(col("id"))
df_featured = df.select("*", hour(col("ts")).alias("hour"), date_format(col("ts"), 'EEEE').alias("weekday"), lag("pro").over(w).alias("pro_lag1"), lag("pre").over(w).alias("pre_lag1"))

df_featured = (df_featured
               .select(col("dem").alias("label"), col("ts"), col("id"), col("hour"), col("weekday"),
                       col("pro_lag1"), col("pre_lag1"), col("pro"), col("pre"))
               .filter(col("pro_lag1") > 0))
df_featured.printSchema()



training_seti = df_featured.select(col("pro_lag1"), col("pre_lag1"),col("hour"), col("ts"), col("label"))

vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()

lr.setPredictionCol("prediction")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer,lr])

lrModel = lrPipeline.fit(training_seti)

predicted_df = lrModel.transform(training_seti)
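
# A quick evaluation sketch (assumed, not in the original): measure the in-sample RMSE
# of the fitted pipeline; RegressionEvaluator is an added import.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
print("Training RMSE:", evaluator.evaluate(predicted_df))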
    "dsfs_ave", "dsfs_stdev", "CharlsonIndexI_max", "CharlsonIndexI_min",
    "CharlsonIndexI_ave", "CharlsonIndexI_range", "CharlsonIndexI_stdev",
    "pcg1", "pcg2", "pcg3", "pcg4", "pcg5", "pcg6", "pcg7", "pcg8", "pcg9",
    "pcg10", "pcg11", "pcg12", "pcg13", "pcg14", "pcg15", "pcg16", "pcg17",
    "pcg18", "pcg19", "pcg20", "pcg21", "pcg22", "pcg23", "pcg24", "pcg25",
    "pcg26", "pcg27", "pcg28", "pcg29", "pcg30", "pcg31", "pcg32", "pcg33",
    "pcg34", "pcg35", "pcg36", "pcg37", "pcg38", "pcg39", "pcg40", "pcg41",
    "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4",
    "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1",
    "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11",
    "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2",
    "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max",
    "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max",
    "labCount_min", "labCount_ave", "labcount_months"
])
vecAssembler.setOutputCol("features")
print(vecAssembler.explainParams())

from pyspark.ml.classification import DecisionTreeClassifier

aft = DecisionTreeClassifier()
aft.setLabelCol("Readmitlabel")
aft.setMaxDepth(30)

print(aft.explainParams())

# COMMAND ----------

from pyspark.ml import Pipeline

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
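
# A minimal sketch completing the pipeline introduced above, assuming a hypothetical
# claims DataFrame (claimsDF) that contains the assembled columns and the Readmitlabel:
dtPipeline = Pipeline(stages=[vecAssembler, aft])
(trainDF, testDF) = claimsDF.randomSplit([0.8, 0.2], seed=42)
dtModel = dtPipeline.fit(trainDF)
predictionsDF = dtModel.transform(testDF)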
Example #6
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

power_data_schema = StructType([
    StructField('Atmospheric_Temperature', DoubleType(), True),
    StructField('Vacuum_Speed', DoubleType(), True),
    StructField('Atmospheric_Pressure', DoubleType(), True),
    StructField('Relative_Humidity', DoubleType(), True),
    StructField('Power_Output', DoubleType(), True)
])

raw_data_df = (spark.read.format("csv")
               .option("delimiter", "\t")
               .option("header", "true")
               .load("/home/ragesh/Data/Power_Plant_Data/power_plant_data", schema=power_data_schema))

# raw_data_df.show(10, truncate=False)

# Converts the list of columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(['Atmospheric_Temperature', 'Vacuum_Speed', 'Atmospheric_Pressure', 'Relative_Humidity'])
vectorizer.setOutputCol('features')

# splitting the dataset into training and test datasets in 80% - 20% ratio
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a Linear Regression Model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE').setLabelCol('Power_Output').setMaxIter(100).setRegParam(0.1)

# Create a ML Pipeline and set the stages
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
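
# A minimal sketch (not in the original) that fits the pipeline and checks the test error;
# RegressionEvaluator is an added import.
from pyspark.ml.evaluation import RegressionEvaluator

lrModel = lrPipeline.fit(trainSetDF)
resultsDF = lrModel.transform(testSetDF)

evaluator = RegressionEvaluator(labelCol='Power_Output', predictionCol='Predicted_PE', metricName='rmse')
print('Test RMSE:', evaluator.evaluate(resultsDF))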
Example #7
# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler, QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer().setInputCol("FATALITY_TYPE").setOutputCol("label").fit(eventFatalitiesDf)

featureDiscretizer = QuantileDiscretizer(numBuckets=10, inputCol="FATALITY_AGE", outputCol="FAT_AGE_BIN").fit(eventFatalitiesDf)
featureIndexers = [StringIndexer().setInputCol(baseFeature).setOutputCol(baseFeature + "_IDX").fit(eventFatalitiesDf) for baseFeature in ["FATALITY_SEX", "FATALITY_LOCATION"]]

featureAssembler = VectorAssembler()
featureAssembler.setInputCols(["FATALITY_SEX_IDX", "FATALITY_LOCATION_IDX", "FAT_AGE_BIN"])
featureAssembler.setOutputCol("features")

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

gbtClassifier = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

pipelineStages = [labelIndexer, featureDiscretizer] + featureIndexers + [featureAssembler, gbtClassifier, labelConverter]
pipeline = Pipeline(stages=pipelineStages)

(trainingData, testData) = eventFatalitiesDf.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
display(predictions.select("predictedLabel", "label", "features"))
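
# Follow-up sketch (assumed, not in the original): the imported MulticlassClassificationEvaluator
# can score the predictions against the indexed label.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))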
Example #8
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline


def entrenar_juez(sc,
                  sql_context,
                  juez_spam,
                  humanos,
                  ciborgs,
                  bots,
                  dir_juez,
                  mongo_uri=None,
                  num_trees=20,
                  max_depth=8):

    logger.info("Entrenando juez...")
    df_humanos = cargar_datos(sc, sql_context, humanos)
    df_bots = cargar_datos(sc, sql_context, bots)
    df_ciborgs = cargar_datos(sc, sql_context, ciborgs)

    tweets_humanos = df_para_tweets(df_humanos)
    tweets_bots = df_para_tweets(df_bots)
    tweets_ciborgs = df_para_tweets(df_ciborgs)

    tweets_df = tweets_humanos.union(tweets_bots).union(tweets_ciborgs)

    df_humanos = df_humanos.dropDuplicates(["user_id"])
    df_bots = df_bots.dropDuplicates(["user_id"])
    df_ciborgs = df_ciborgs.dropDuplicates(["user_id"])

    tweets = tweets_features(tweets_df, juez_spam)
    tweets.cache()

    usuarios_features_humanos = usuarios_features(df_humanos, 0.0)
    usuarios_features_bots = usuarios_features(df_bots, 1.0)
    usuarios_features_ciborgs = usuarios_features(df_ciborgs, 2.0)

    usuarios = usuarios_features_ciborgs.union(usuarios_features_bots).union(
        usuarios_features_humanos).cache()

    set_datos = usuarios.join(tweets, tweets.user_id == usuarios.user_id).drop(
        tweets.user_id).fillna(0).cache()

    seed = 1800009193
    (split_20_df, split_80_df) = set_datos.randomSplit([20.0, 80.0], seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    vectorizer = VectorAssembler()
    vectorizer.setInputCols([
        "ano_registro", "con_descripcion", "con_geo_activo",
        "con_imagen_default", "con_imagen_fondo", "con_perfil_verificado",
        "entropia", "followers_ratio", "n_favoritos", "n_listas", "n_tweets",
        "reputacion", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
        "22", "23", "uso_mobil", "uso_terceros", "uso_web",
        "avg_diversidad_lex", "avg_long_tweets", "reply_ratio", "avg_hashtags",
        "mention_ratio", "avg_palabras", "avg_diversidad_palabras",
        "url_ratio", "avg_spam"
    ])

    vectorizer.setOutputCol("features")

    rf = RandomForestClassifier()

    rf.setLabelCol("categoria") \
        .setPredictionCol("Predicted_categoria") \
        .setFeaturesCol("features") \
        .setSeed(seed) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([vectorizer, rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="Predicted_categoria",
        labelCol="categoria",
        metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline,
                              evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)

    logger.info("Buscando el mejor modelo de RandomForest")

    rf_model = crossval.fit(training_set_df).bestModel

    logger.info("Guardando en juez")
    guardar_juez(rf_model, dir_juez)
    logger.info("Guardando set de entrenamiento")
    training_set_df.write.json(dir_juez + "_trainingset", mode="overwrite")

    logger.info("Guardando en Mongo el set de entrenamiento")

    if mongo_uri:
        training_set_df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)

    logger.info("Evaluando set de prueba")

    predictions_and_labels_df = rf_model.transform(test_set_df)
    predictions_and_labels_df.cache()

    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    logger.info("Calculando matriz de confusion")

    hh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    hb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    hc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    bh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    bb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    bc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    ch = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    cb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    cc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    return rf_model, accuracy, [[hh, hb, hc], [bh, bb, bc], [ch, cb, cc]]
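
# An alternative sketch (not the original code): the same 3x3 confusion matrix can be
# built with a single aggregation instead of nine filtered counts.
def matriz_confusion(predicciones):
    conteos = {(int(fila["categoria"]), int(fila["Predicted_categoria"])): fila["count"]
               for fila in predicciones.groupBy("categoria", "Predicted_categoria").count().collect()}
    return [[conteos.get((real, pred), 0) for pred in range(3)] for real in range(3)]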
Example #9
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler

x_y_RH = sqlContext.sql("SELECT RH as RH, PE as PE from power_plant")
x_y_DF_RH = pd.DataFrame(x_y_RH.toPandas().sample(n=1000),columns=['RH','PE'])

x_y_DF_RH.plot(kind='scatter',x='RH',y='PE',color='yellow')
plt.show()

## Preparing the data for machine learning

# We will use VectorAssembler() to build the feature vector

datasetDF = sqlContext.table('power_plant')

vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])
vectorizer.setOutputCol("features") # Aquí guardaremos nuestra variable objetivo Potencia (PE)

# Split the dataset into training (80%) and test (20%)

seed = 1800009193
(split20DF, split80DF) = datasetDF.randomSplit([0.2, 0.8], seed)
trainingSetDF = split80DF
testSetDF = split20DF

# Cache the data to speed up the computations

trainingSetDF.cache()
testSetDF.cache()

# Decision trees
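
# A minimal sketch of the decision-tree step this comment announces (assumed, not from the
# original notebook): a DecisionTreeRegressor on the same features, predicting PE.
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

dt = DecisionTreeRegressor()
dt.setLabelCol("PE").setPredictionCol("Predicted_PE").setMaxDepth(5)

dtPipeline = Pipeline(stages=[vectorizer, dt])
dtModel = dtPipeline.fit(trainingSetDF)
dtPredictions = dtModel.transform(testSetDF)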