Example #1
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.feature import VectorAssembler


def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Hierarchical K means
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()

    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)

    return rv
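A hypothetical invocation (not part of the original snippet); the argument values and formats are assumptions, since applyFilter is defined elsewhere:

top_clusters = getTopClusters("2015-01-01", "2015-12-31", "08:00", "18:00", "THEFT")
for cluster in top_clusters:
    print(cluster["c"], cluster["o"])  # (X, Y) center and number of points in the cluster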
Example #2
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as f


def run(spark_session: pyspark.sql.session.SparkSession):
    bike_sharing = spark_session.read.csv("day.csv", header=True)

    bike_sharing01 = bike_sharing.select(
        bike_sharing.season.astype("int"),
        bike_sharing.yr.astype("int"),
        bike_sharing.mnth.astype("int"),
        bike_sharing.holiday.astype("int"),
        bike_sharing.weekday.astype("int"),
        bike_sharing.workingday.astype("int"),
        bike_sharing.weathersit.astype("int"),
        bike_sharing.temp.astype("double"),
        bike_sharing.atemp.astype("double"),
        bike_sharing.hum.astype("double"),
        bike_sharing.windspeed.astype("double"),
        bike_sharing.cnt.astype("int").alias("label")
    )

    assembler = VectorAssembler()
    assembler.setInputCols(bike_sharing01.columns[:-1])
    assembler.setOutputCol("features")
    train, test = bike_sharing01.randomSplit((0.7, 0.3))

    train01 = assembler.transform(train)

    train02 = train01.select("features", "label")

    lr = LinearRegression()

    model = lr.fit(train02)

    test2 = assembler.transform(test)
    test02 = test2.select("features", "label")
    out = model.transform(test02)

    # Capture the evaluation metrics instead of discarding them
    e = RegressionEvaluator()
    r2 = e.evaluate(out, {e.metricName: "r2"})
    rmse = e.evaluate(out, {e.metricName: "rmse"})

    res = out.select(f.abs(f.col("label")-f.col("prediction")).alias("diff"))
    accs = res.select(f.when(f.col("diff") < 300, 1).otherwise(0).alias("is_accurate"))
    accs.limit(3).toPandas()  # preview a few rows
    accuracy = accs.agg(f.mean("is_accurate").alias("accuracy")).toPandas()

    # using MinMaxScaler to scale features:
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    scaler_model = scaler.fit(train02)
    scaler_model.transform(train02)
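    # Hedged continuation (not in the original snippet): retrain the regression on the
    # scaled features produced by the MinMaxScaler above. Variable names are assumptions.
    train_scaled = scaler_model.transform(train02).select(
        f.col("scaled_features").alias("features"), "label")
    test_scaled = scaler_model.transform(test02).select(
        f.col("scaled_features").alias("features"), "label")
    model_scaled = LinearRegression().fit(train_scaled)
    out_scaled = model_scaled.transform(test_scaled)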
Example #3
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import StructType, StructField, DoubleType

power_data_schema = StructType([
    StructField('Atmospheric_Temperature', DoubleType(), True),
    StructField('Vacuum_Speed', DoubleType(), True),
    StructField('Atmospheric_Pressure', DoubleType(), True),
    StructField('Relative_Humidity', DoubleType(), True),
    StructField('Power_Output', DoubleType(), True)
])

raw_data_df = spark.read.format("csv").option("delimiter", "\t").option("header", "true").\
    load("/home/ragesh/Data/Power_Plant_Data/power_plant_data", schema=power_data_schema)

# raw_data_df.show(10, truncate=False)

# Converts the list of columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(['Atmospheric_Temperature', 'Vacuum_Speed', 'Atmospheric_Pressure', 'Relative_Humidity'])
vectorizer.setOutputCol('features')

# splitting the dataset into training and test datasets in 80% - 20% ratio
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a Linear Regression Model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE').setLabelCol('Power_Output').setMaxIter(100).setRegParam(0.1)

# Create an ML Pipeline and set the stages (assembler first, then the linear regression)
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
Example #4
# SQL DataFrame: select, filter, and group columns
df.select("name").show()
df.select(df['name'], df['age'] + 1).show()
df.filter(df['age'] > 21).show()
df.groupBy("age").count().show()  # count() usually follows groupBy()

#########################
# feature construction  #
#########################

# feature construction
from pyspark.ml.feature import VectorAssembler
# Get the DataFrame to allow column-wise operations
datasetDF = sqlContext.table("plant_dataset")
vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])  # combine several columns together to form a feature vector
vectorizer.setOutputCol("features")  # name of the output column


# splitting is performed on the DataFrame, not on the table!
(split15DF, split85DF) = datasetDF.randomSplit([0.15, 0.85], seed=1900009193)
# cache the split datasets: the 15% slice is the test set, the 85% slice is the training set
testSetDF = split15DF.cache()
trainSetDF = split85DF.cache()

###########################
# model building in spark #
###########################
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
pipeline = Pipeline()
assembler = VectorAssembler()

print(assembler.explainParams())
print('\n', pipeline.explainParams())

# COMMAND ----------

# ANSWER
# Set assembler params
(assembler.setInputCols(['lengthFeatures',
                         'widthFeatures']).setOutputCol('featuresBucketized'))

pipeline.setStages([lengthBucketizer, widthBucketizer, assembler])
irisAssembled = pipeline.fit(irisSeparateFeatures).transform(
    irisSeparateFeatures)
display(irisAssembled)

# COMMAND ----------

# TEST
from pyspark.mllib.linalg import Vectors
firstAssembly = irisAssembled.select('lengthFeatures', 'widthFeatures',
                                     'featuresBucketized').first()
Test.assertTrue(
    all(firstAssembly[2].toArray() == [firstAssembly[0], firstAssembly[1]]),
    'incorrect value for column featuresBucketized')
Example #6
from pyspark.ml.feature import VectorAssembler


def read(cfg, spark):
    df = spark.read.load("/tmp/foo.parq/")
    assembler = VectorAssembler(outputCol="features")
    X = assembler.setInputCols(df.columns).transform(df).select("features")
    X = X.persist()
    return X
Example #7
import datetime

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, date_format, hour, lag, udf, unix_timestamp
from pyspark.sql.types import DateType
from pyspark.sql.window import Window

df = df_prev.join(df_gen, df_prev.ts == df_gen.ts).drop(df_gen.ts).sort(col("ts"))
func = udf(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M'), DateType())
df = df.select('*', unix_timestamp(func(col('ts'))).alias("id"))

w = Window.partitionBy().orderBy(col("id"))
df_featured = df.select(
    "*",
    hour(col("ts")).alias("hour"),
    date_format(col("ts"), 'EEEE').alias("weekday"),
    lag("pro").over(w).alias("pro_lag1"),
    lag("pre").over(w).alias("pre_lag1"))

df_featured = df_featured.select(
    col("dem").alias("label"), col("ts"), col("id"), col("hour"), col("weekday"),
    col("pro_lag1"), col("pre_lag1"), col("pro"), col("pre")
).filter(col("pro_lag1") > 0)
df_featured.printSchema()



training_seti = df_featured.select(col("pro_lag1"), col("pre_lag1"), col("hour"), col("ts"), col("label"))

vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()

lr.setPredictionCol("prediction")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

lrModel = lrPipeline.fit(training_seti)
finaldf = indexed1.filter(col('trainset') == 1)

from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler()
vecAssembler.setInputCols([
    "age_05", "age_15", "age_25", "age_35", "age_45", "age_55", "age_65",
    "age_75", "age_85", "age_MISS", "sexMALE", "sexFEMALE", "sexMISS",
    "no_Claims", "no_Providers", "no_Vendors", "no_PCPs", "no_PlaceSvcs",
    "no_Specialities", "no_PrimaryConditionGroups", "no_ProcedureGroups",
    "PayDelay_max", "PayDelay_min", "PayDelay_ave", "PayDelay_stdev",
    "LOS_max", "LOS_min", "LOS_ave", "LOS_stdev", "LOS_TOT_UNKNOWN",
    "LOS_TOT_SUPRESSED", "LOS_TOT_KNOWN", "dsfs_max", "dsfs_min", "dsfs_range",
    "dsfs_ave", "dsfs_stdev", "CharlsonIndexI_max", "CharlsonIndexI_min",
    "CharlsonIndexI_ave", "CharlsonIndexI_range", "CharlsonIndexI_stdev",
    "pcg1", "pcg2", "pcg3", "pcg4", "pcg5", "pcg6", "pcg7", "pcg8", "pcg9",
    "pcg10", "pcg11", "pcg12", "pcg13", "pcg14", "pcg15", "pcg16", "pcg17",
    "pcg18", "pcg19", "pcg20", "pcg21", "pcg22", "pcg23", "pcg24", "pcg25",
    "pcg26", "pcg27", "pcg28", "pcg29", "pcg30", "pcg31", "pcg32", "pcg33",
    "pcg34", "pcg35", "pcg36", "pcg37", "pcg38", "pcg39", "pcg40", "pcg41",
    "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4",
    "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1",
    "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11",
    "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2",
    "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max",
    "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max",
    "labCount_min", "labCount_ave", "labcount_months"
])
vecAssembler.setOutputCol("features")
print(vecAssembler.explainParams())

from pyspark.ml.classification import DecisionTreeClassifier
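The import above is never used in this excerpt; a minimal, hedged sketch of how the classifier might follow the assembler (the label column name "target" is a placeholder, not part of the original):

# Hedged sketch only: "target" is a hypothetical label column.
dt = DecisionTreeClassifier(labelCol="target", featuresCol="features", maxDepth=5)
dtPipeline = Pipeline(stages=[vecAssembler, dt])
dtModel = dtPipeline.fit(finaldf)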
Example #9
# COMMAND ----------

(split15DF, split85DF) = parsed_df.randomSplit([0.15, 0.85], seed=1900009193)
testSetDF = split15DF.cache()
trainingSetDF = split85DF.cache()

# COMMAND ----------

parsed_df.printSchema()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
vectorizer = VectorAssembler()
vectorizer.setInputCols(columns[:-2])
vectorizer.setOutputCol("features")

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor()

dt.setPredictionCol("Prediction_cuisine")\
  .setLabelCol("6714")\
  .setFeaturesCol("features")\
  .setMaxBins(100)
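Pipeline is imported above but never used in this excerpt; a minimal, hedged sketch of the likely next step, following the pattern of the other examples on this page (variable names are assumptions):

# Hedged sketch: assemble the features and fit the tree in one pipeline.
dtPipeline = Pipeline(stages=[vectorizer, dt])
dtModel = dtPipeline.fit(trainingSetDF)
predictionsDF = dtModel.transform(testSetDF)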
Example #10
meta_train_df = building_metadata_df.join(train_df, building_metadata_df['building_id'] == train_df['building_id'])
cond = [weather_train_df.site_id == meta_train_df.site_id, weather_train_df.timestamp == meta_train_df.timestamp]
trainDF = weather_train_df.join(meta_train_df, cond)

# COMMAND ----------

datasetDF = trainDF.drop("timestamp", "site_id", "building_id")
datasetDF = datasetDF.na.fill(0)

# COMMAND ----------

# ***** vectorizer MODEL ****
from pyspark.ml.feature import VectorAssembler

vectorizer = VectorAssembler()
vectorizer.setInputCols(["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr", "sea_level_pressure", 
                         "wind_direction", "wind_speed", "square_feet", "year_built", "floor_count", "meter"])
vectorizer.setOutputCol("features")

# COMMAND ----------

split15DF, split85DF = datasetDF.randomSplit([15., 85.], seed=190)

# Caching is optional here; uncomment .cache() to cache these datasets for performance
testSetDF = split15DF  # .cache()
trainingSetDF = split85DF  # .cache()

# COMMAND ----------

# ***** LINEAR REGRESSION MODEL ****

from pyspark.ml.regression import LinearRegression
#display(parsed_df)
parsed_df.count()

# COMMAND ----------

all_columns_count = 6715
# Use a list (not a lazy map object) so the column names can be sliced below
columns = [str(c) for c in range(all_columns_count)]

len(columns[:-1])

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
vectorizer = VectorAssembler()
feature_cols = columns[:-1]
vectorizer.setInputCols(feature_cols)
vectorizer.setOutputCol("features")

# COMMAND ----------

from pyspark.sql.functions import *

expr = [col(c).cast('Double').alias(c) for c in columns]
parsed_df = parsed_df.select(*expr)
df2 = vectorizer.transform(parsed_df)

# COMMAND ----------

# df2.show()
df3 = df2.select('features')
display(df3)
Example #12
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# The opening of this read call is truncated in the original; it is reconstructed here
# from the options that remain.
df = spark.read.format("csv").options(
    delimiter='\t', header='true',
    inferschema='true').load("/databricks-datasets/power-plant/data")
display(df)
# Define a seed
seed = 1800009193
# Generate a training set and a test set with an 80-20 split
(split20DF, split80DF) = df.randomSplit([.2, .8], seed=seed)

# Cache the datasets
testSetDF = split20DF.cache()
trainingSetDF = split80DF.cache()
display(trainingSetDF)

# Define a vector assembler so the input variables end up in a single "features" column
vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])
vectorizer.setOutputCol("features")

# Define the regression tree model
dt = DecisionTreeRegressor()

# Define the model parameters:
# - Predicted_PE: column that will store the estimated predictions
# - features: column that holds the vector of predictor variables
# - PE: column that holds the actual target value
# - maximum depth of 8 levels
dt.setPredictionCol("Predicted_PE").setMaxBins(100).setFeaturesCol(
    "features").setLabelCol("PE").setMaxDepth(8)

# Create a pipeline with two stages:
# a VectorAssembler and a DecisionTreeRegressor model (see the sketch below)
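The snippet ends before the pipeline described above is actually built; a minimal, hedged sketch of that step using the two stages defined in this example (variable names are assumptions):

from pyspark.ml import Pipeline

# Hedged sketch: assemble the features and fit the decision tree on the 80% split.
dtPipeline = Pipeline(stages=[vectorizer, dt])
dtModel = dtPipeline.fit(trainingSetDF)
predictionsDF = dtModel.transform(testSetDF)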
Example #13
# 2-5
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

housing_0 = housing \
    .withColumn("median_house_value", housing["median_house_value"].cast('float')) \
    .withColumn("total_rooms", housing["total_rooms"].cast('float')) \
    .withColumn("housing_median_age", housing["housing_median_age"].cast('float')) \
    .withColumn("population", housing["population"].cast('float')) \
    .withColumn("total_bedrooms", housing["total_bedrooms"].cast('float')) \
    .withColumn("longitude", housing["longitude"].cast('float')) \
    .withColumn("latitude", housing["latitude"].cast('float')) \
    .withColumn("households", housing["households"].cast('float')) \
    .withColumn("median_income", housing["median_income"].cast('float'))

housing_0.printSchema()
assembler = VectorAssembler(outputCol="features")
assembler.setInputCols(["median_house_value", "total_rooms", "housing_median_age", "population"])
features_housing_0 = assembler.transform(housing_0)
features_housing_0.show()
features_housing_0.select('features').show()

corr1 = Correlation.corr(features_housing_0, 'features', 'pearson').head()
print("pearson correlation matrix : " + str(corr1[0]))

# 2-6
housingCol1 = housing_0.withColumn('rooms_per_household', housing_0.total_rooms / housing_0.households)
housingCol2 = housingCol1.withColumn('bedrooms_per_room', housingCol1.total_bedrooms / housingCol1.total_rooms)
housingExtra = housingCol2.withColumn('population_per_household', housing_0.population / housing_0.households)
housingExtra.show(5)

# 3-1
renamedHousing = housingExtra.withColumnRenamed('median_house_value', 'label')
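With the target renamed to "label", a hedged sketch of the regression this sets up; the feature list, the na.drop() call, and the variable names are assumptions, not part of the original:

from pyspark.ml.regression import LinearRegression

# Hedged sketch: assemble a few engineered columns and fit a baseline model.
# Rows with missing values are dropped so VectorAssembler does not fail on nulls.
feature_assembler = VectorAssembler(
    inputCols=["median_income", "rooms_per_household", "population_per_household", "housing_median_age"],
    outputCol="features")
assembled = feature_assembler.transform(renamedHousing.na.drop())
lr_model = LinearRegression().fit(assembled.select("features", "label"))
print(lr_model.coefficients, lr_model.intercept)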
Example #14
    return result
    
num_nodes = 2

# data_path = "/home/hadoop/MillionSongSubset/data/A/A/A"
data_path = '/mnt/snap/data'
# TODO: fix with nested dir
# filenames = [os.path.join(data_path, filename) for filename in os.listdir(data_path)]
filenames = getListOfFiles(data_path)
rdd = sc.parallelize(filenames, num_nodes)
rdd1 = rdd.flatMap(lambda x: read_h5_to_list(x))
# TODO: modified with attribute name
col_name = ["artist familiarity", "artist hotttnesss", "artist id", "artist location", "artist mbtags", 
 "artist mbtags count", "artist name", "artist terms", "artist terms freq", "artist terms weight", 
 "danceability", "duration", "end of fade in", "energy", "key",
"key confidence", "loudness", "mode", "mode confidence", "release", 
 "segments confidence", "segments loudness max", "segments loudness max time", 
"segments pitches", "segments timbre", "similar artists", 
"song hotttnesss", "song id", "start of fade out", "tempo", "time signature", 
"time signature confidence", "title", "track id", "year"]

df1 = rdd1.toDF(col_name)

# Combine every column into a single feature vector
from pyspark.ml.feature import VectorAssembler
vectorizer = VectorAssembler()
vectorizer.setInputCols(col_name)
vectorizer.setOutputCol("features")


from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

lrPipeline = Pipeline()
kmeans = KMeans().setK(2).setSeed(1)
lrPipeline.setStages([vectorizer, kmeans])
model = lrPipeline.fit(df1)
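A brief, hedged follow-up (not in the original excerpt): the fitted pipeline assigns each row to a cluster through the model's default "prediction" column.

clustered_df = model.transform(df1)
clustered_df.groupBy("prediction").count().show()  # size of each of the two clusters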
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="ICD9_DGNS_CD_1", outputCol="ICD9_DGNS_CD_1N")
indexed = indexer.fit(train_data)
train_data = indexed.transform(train_data)
indexer = StringIndexer(inputCol="ICD9_DGNS_CD_2", outputCol="ICD9_DGNS_CD_2N")
indexed = indexer.fit(train_data)
train_data = indexed.transform(train_data)

train_data = train_data.select("ICD9_DGNS_CD_1N","ICD9_DGNS_CD_2N","ICD9_PRCDR_CD_1N","ICD9_PRCDR_CD_2N","TOTAL_BENEFICIARY_AMT")
train_data= train_data.na.fill({'TOTAL_BENEFICIARY_AMT':0})

from pyspark.ml.feature import VectorAssembler
vectorizer = VectorAssembler()
#datasetDF.select(datasetDF['PE'].alias('features')).show()
vectorizer.setInputCols(["ICD9_DGNS_CD_1N","ICD9_DGNS_CD_2N","ICD9_PRCDR_CD_1N","ICD9_PRCDR_CD_2N"])
vectorizer.setOutputCol("features")

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor(maxDepth=8)

dt.setLabelCol("TOTAL_BENEFICIARY_AMT")\
  .setPredictionCol("Predicted_EXP")\
  .setFeaturesCol("features")\
  .setMaxBins(10000)


# Create a Pipeline
# Source and result file names
TrainSource = "dbfs:/mnt/awshank/Test/training_results.csv"
TestSource = "dbfs:/mnt/awshank/Test/target_locations.txt"
TestResult = "dbfs:/mnt/awshank/Test/target_results.psv"

# Condition expression: map the weather-condition text to a numeric class
ConditionExpr = "CASE WHEN Cond LIKE '%Snow%' THEN 0 WHEN Cond LIKE '%Rain%' THEN 1 ELSE 2 END AS Condition"
ConditionRevExpr = ["Snow", "Rain", "Sunny"]

# Read the source file and create a DataFrame
ModalDF = sqlContext.read.csv(TrainSource, header="True", inferSchema="True").selectExpr("*", ConditionExpr)

# Transform the input columns into a single vector column called "features"
vectorizer = VectorAssembler()
vectorizer.setInputCols(["Lat", "Long", "Ele", "LocalTime"])
vectorizer.setOutputCol("features")

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression

# Declare an estimator for each regression target
lr0 = LogisticRegression(labelCol="Condition", predictionCol="Predicted_Cond", maxIter=100, regParam=0, family="multinomial")
lr1 = LinearRegression(labelCol="Temp", predictionCol="Predicted_Temp", maxIter=100, regParam=0.1)
lr2 = LinearRegression(labelCol="Pres", predictionCol="Predicted_Pres", maxIter=100, regParam=0.1)
lr3 = LinearRegression(labelCol="Humid", predictionCol="Predicted_Humid", maxIter=100, regParam=0.1)

# Combine all the regressions in a pipeline and fit the dataset to create a model
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr1, lr2, lr3, lr0])
lrModel = lrPipeline.fit(ModalDF)

# COMMAND ----------
Example #17
# MAGIC   * Validate the model on test data

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler, QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer().setInputCol("FATALITY_TYPE").setOutputCol("label").fit(eventFatalitiesDf)

featureDiscretizer = QuantileDiscretizer(numBuckets=10, inputCol="FATALITY_AGE", outputCol="FAT_AGE_BIN").fit(eventFatalitiesDf)
featureIndexers = [StringIndexer().setInputCol(baseFeature).setOutputCol(baseFeature + "_IDX").fit(eventFatalitiesDf) for baseFeature in ["FATALITY_SEX", "FATALITY_LOCATION"]]

featureAssembler = VectorAssembler()
featureAssembler.setInputCols(["FATALITY_SEX_IDX", "FATALITY_LOCATION_IDX", "FAT_AGE_BIN"])
featureAssembler.setOutputCol("features")

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

gbtClassifier = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

pipelineStages = [labelIndexer, featureDiscretizer] + featureIndexers + [featureAssembler, gbtClassifier, labelConverter]
pipeline = Pipeline(stages=pipelineStages)

(trainingData, testData) = eventFatalitiesDf.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
display(predictions.select("predictedLabel", "label", "features"))
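MulticlassClassificationEvaluator is imported above but not used in this excerpt; a minimal, hedged sketch of the "validate the model on test data" step (the choice of the accuracy metric is an assumption):

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy: %g" % accuracy)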
Example #18
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def entrenar_juez(sc,
                  sql_context,
                  juez_spam,
                  humanos,
                  ciborgs,
                  bots,
                  dir_juez,
                  mongo_uri=None,
                  num_trees=20,
                  max_depth=8):

    logger.info("Entrenando juez...")
    df_humanos = cargar_datos(sc, sql_context, humanos)
    df_bots = cargar_datos(sc, sql_context, bots)
    df_ciborgs = cargar_datos(sc, sql_context, ciborgs)

    tweets_humanos = df_para_tweets(df_humanos)
    tweets_bots = df_para_tweets(df_bots)
    tweets_ciborgs = df_para_tweets(df_ciborgs)

    tweets_df = tweets_humanos.union(tweets_bots).union(tweets_ciborgs)

    df_humanos = df_humanos.dropDuplicates(["user_id"])
    df_bots = df_bots.dropDuplicates(["user_id"])
    df_ciborgs = df_ciborgs.dropDuplicates(["user_id"])

    tweets = tweets_features(tweets_df, juez_spam)
    tweets.cache()

    usuarios_features_humanos = usuarios_features(df_humanos, 0.0)
    usuarios_features_ciborgs = usuarios_features(df_bots, 1.0)
    usuarios_features_bots = usuarios_features(df_ciborgs, 2.0)

    usuarios = usuarios_features_ciborgs.union(usuarios_features_bots).union(
        usuarios_features_humanos).cache()

    set_datos = usuarios.join(tweets, tweets.user_id == usuarios.user_id).drop(
        tweets.user_id).fillna(0).cache()

    seed = 1800009193
    (split_20_df, split_80_df) = set_datos.randomSplit([20.0, 80.0], seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    vectorizer = VectorAssembler()
    vectorizer.setInputCols([
        "ano_registro", "con_descripcion", "con_geo_activo",
        "con_imagen_default", "con_imagen_fondo", "con_perfil_verificado",
        "entropia", "followers_ratio", "n_favoritos", "n_listas", "n_tweets",
        "reputacion", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
        "22", "23", "uso_mobil", "uso_terceros", "uso_web",
        "avg_diversidad_lex", "avg_long_tweets", "reply_ratio", "avg_hashtags",
        "mention_ratio", "avg_palabras", "avg_diversidad_palabras",
        "url_ratio", "avg_spam"
    ])

    vectorizer.setOutputCol("features")

    rf = RandomForestClassifier()

    rf.setLabelCol("categoria") \
        .setPredictionCol("Predicted_categoria") \
        .setFeaturesCol("features") \
        .setSeed(seed) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([vectorizer, rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="Predicted_categoria",
        labelCol="categoria",
        metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline,
                              evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)

    logger.info("Buscando el mejor modelo de RandomForest")

    rf_model = crossval.fit(training_set_df).bestModel

    logger.info("Guardando en juez")
    guardar_juez(rf_model, dir_juez)
    logger.info("Guardando set de entrenamiento")
    training_set_df.write.json(dir_juez + "_trainingset", mode="overwrite")

    logger.info("Guardando en Mongo el set de entrenamiento")

    if mongo_uri:
        training_set_df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)

    logger.info("Evaluando set de prueba")

    predictions_and_labels_df = rf_model.transform(test_set_df)
    predictions_and_labels_df.cache()

    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    logger.info("Calculando matriz de confusion")

    hh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    hb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    hc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 0)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    bh = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    bb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    bc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 1)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    ch = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 0)].count()
    cb = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 1)].count()
    cc = predictions_and_labels_df[
        (predictions_and_labels_df.categoria == 2)
        & (predictions_and_labels_df.Predicted_categoria == 2)].count()

    return rf_model, accuracy, [[hh, hb, hc], [bh, bb, bc], [ch, cb, cc]]