コード例 #1
0
ファイル: tools.py プロジェクト: JosemyDuarte/twitterJudge
def entrenar_spam(sc,
                  sql_context,
                  dir_spam,
                  dir_no_spam,
                  num_trees=20,
                  max_depth=8):
    """Train a random-forest spam classifier on tweet text and score it.

    Tweets are read as JSON lines from ``dir_spam`` (labelled 1.0) and
    ``dir_no_spam`` (labelled 0.0).  Their ``text`` column is tokenized and
    hashed into 140 term-frequency features, the data is split 20/80 into a
    test and a training set, and a 5-fold cross validation over
    ``maxBins`` in {50, 100} selects the best random forest.

    Args:
        sc: SparkContext used to read both input paths.
        sql_context: SQLContext used to parse the JSON lines.
        dir_spam: path of the tweets to label as spam.
        dir_no_spam: path of the tweets to label as not spam.
        num_trees: number of trees of the random forest.
        max_depth: maximum depth of each tree.

    Returns:
        Tuple ``(modelo, accuracy)``: the best model found by the cross
        validation and its accuracy on the held-out 20% split.
    """
    input_spam = sc.textFile(dir_spam)
    input_no_spam = sc.textFile(dir_no_spam)

    spam = sql_context.read.json(input_spam).select("text").withColumn(
        "label", F.lit(1.0))
    no_spam = sql_context.read.json(input_no_spam).select("text").withColumn(
        "label", F.lit(0.0))

    # union() is the non-deprecated spelling of unionAll().
    training_data = spam.union(no_spam)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    words_data = tokenizer.transform(training_data)

    hashing_tf = HashingTF(inputCol="words",
                           outputCol="rawFeatures",
                           numFeatures=140)
    featurized_data = hashing_tf.transform(words_data)
    # IDF rescaling was disabled in the original (kept as a dead string
    # literal); the forest is trained on the raw term frequencies.

    # Plain int literals: the Python 2 "L" suffix is a syntax error on
    # Python 3 and unnecessary on Python 2.
    seed = 1800009193
    (split_20_df, split_80_df) = featurized_data.randomSplit([20.0, 80.0],
                                                             seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    rf = RandomForestClassifier().setLabelCol("label") \
        .setPredictionCol("predicted_label") \
        .setFeaturesCol("rawFeatures") \
        .setSeed(100088121) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="predicted_label",
        labelCol="label",
        metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline,
                              evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)
    modelo = crossval.fit(training_set_df).bestModel

    # Accuracy is measured on the 20% split that was never used for fitting.
    predictions_and_labels_df = modelo.transform(test_set_df)
    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    return modelo, accuracy
コード例 #2
0
def transform(df: DataFrame, debug: bool = False) -> DataFrame:
    """Fit the feature pipeline on ``df`` and return ``df`` transformed by it.

    The pipeline stages come from ``get_stages`` and are built for every
    column whose name contains ``"feat_"``.

    Args:
        df: input DataFrame; it is used both to fit and to transform.
        debug: when true, print the pipeline's parameter description first.

    Returns:
        The DataFrame produced by the fitted pipeline.
    """
    selected = [name for name in df.columns if "feat_" in name]

    pipeline = Pipeline()
    pipeline.setStages(get_stages(selected))
    if debug:
        print(pipeline.explainParams())

    fitted = pipeline.fit(df)
    return fitted.transform(df)
コード例 #3
0
    def onehot_encode(self, df, features):
        """Index and then one-hot encode the given columns of ``df``.

        For every name ``f`` in ``features`` a StringIndexer writes
        ``f + "_indexed"`` and a OneHotEncoder (with ``setDropLast(False)``)
        writes ``f + "_vector"``.

        Args:
            df: input DataFrame.
            features: names of the columns to encode.

        Returns:
            The DataFrame with the indexed and the encoded columns added.
        """
        # First pass: string -> index.
        pipeline = Pipeline(stages=[
            StringIndexer(inputCol=name, outputCol=name + "_indexed")
            for name in features
        ])
        indexed = pipeline.fit(df).transform(df)

        # Second pass: index -> one-hot vector, reusing the same Pipeline.
        encoders = []
        for name in features:
            encoder = OneHotEncoder(inputCol=name + "_indexed",
                                    outputCol=name + "_vector")
            encoders.append(encoder.setDropLast(False))
        pipeline.setStages(encoders)

        return pipeline.fit(indexed).transform(indexed)
コード例 #4
0
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Decision-tree regressor reading "features" and predicting column "6714".
dt = DecisionTreeRegressor(featuresCol="features",
                           labelCol="6714",
                           predictionCol="Prediction_cuisine",
                           maxBins=100)

# Pipeline: feature vectorizer followed by the regressor.
dtPipeline = Pipeline(stages=[vectorizer, dt])

# Fit on the entire training set first to get a baseline.
dtModel = dtPipeline.fit(trainingSetDF)

# COMMAND ----------

# Score the test set and persist the predictions as Parquet.
resultsDtDf = dtModel.transform(testSetDF)
(resultsDtDf.write
 .format('parquet')
 .mode("overwrite")
 .option("header", True)
 .save('/mnt/data/resultsDtDf.parquet'))

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Decision-tree classifier trained against the "Readmitlabel" column.
aft = DecisionTreeClassifier()
aft.setLabelCol("Readmitlabel")
aft.setMaxDepth(30)

# print(...) with a single argument behaves identically on Python 2 and 3,
# unlike the Python 2-only print statement used before.
print(aft.explainParams())

# COMMAND ----------

from pyspark.ml import Pipeline

# spark.ml Pipeline API (will look familiar to scikit-learn users).
# Stage 1 builds the feature vector, stage 2 is the classifier `aft`.
lrPipeline = Pipeline(stages=[vecAssembler, aft])

# A Pipeline is itself an Estimator, so training it means calling fit().
lrPipelineModel = lrPipeline.fit(finaldf)

# COMMAND ----------

# DBTITLE 1,Using Model for data predicition
# NOTE: predictions are computed on the same DataFrame the model was fit on.
predictionsAndLabelsDF = lrPipelineModel.transform(finaldf)
predAnalysis = predictionsAndLabelsDF.select(
    'age_05', 'age_15', 'age_25', 'age_35', 'age_45', 'age_55',
    'age_65', 'age_75', 'age_85', 'age_MISS',
    'DaysInHospital', 'prediction')
# Label/prediction pairs from which a confusion matrix can be derived.
confusionMatrix = predictionsAndLabelsDF.select('Readmitlabel', 'prediction')
コード例 #6
0
ファイル: Sommelier.py プロジェクト: mmengarelli/notebooks
                            outputCol="user_id",
                            handleInvalid="skip")
# Map each title string to a numeric "item_id"; rows with unseen titles
# are skipped at transform time (handleInvalid="skip").
titleIndexer = StringIndexer(
    inputCol="title", outputCol="item_id", handleInvalid="skip")

# ALS recommender over the indexed user/item ids with "points" as rating.
als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="points",
    maxIter=5,
    regParam=0.01,
    nonnegative=True,
    coldStartStrategy="drop")

# Index users, index titles, then train ALS, all as one pipeline.
pipeline = Pipeline(stages=[userIndexer, titleIndexer, als])

model = pipeline.fit(train)

# COMMAND ----------

# DBTITLE 0,Predict
predictions = model.transform(test)
predictions.createOrReplaceTempView("predictions")

# COMMAND ----------

# MAGIC %md ## Recommendations

# COMMAND ----------
コード例 #7
0
ファイル: pipeline.py プロジェクト: rageshn/spark
vectorizer.setOutputCol('features')

# Split the data 20% / 80% into test and training sets (fixed seed).
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Linear regression predicting 'Power_Output' into 'Predicted_PE'.
# print(lr.explainParams())
lr = LinearRegression(predictionCol='Predicted_PE',
                      labelCol='Power_Output',
                      maxIter=100,
                      regParam=0.1)

# Pipeline: feature vectorizer followed by the regression.
lrPipeline = Pipeline(stages=[vectorizer, lr])

# Train on the training split.
lrModel = lrPipeline.fit(trainSetDF)

# The fitted regression is the second pipeline stage.
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Every column name except the output column.
# NOTE(review): assumes these match the vectorizer's input columns in
# order -- confirm against the vectorizer configuration.
features = [name for name in trainSetDF.columns if name != "Power_Output"]

# Pair each weight with its column, largest absolute weight first.
coefficents = sorted(zip(weights, features),
                     key=lambda pair: abs(pair[0]),
                     reverse=True)
# extract from the pipeline before we can extract the model.summary information.  We'll see this in the cell below.
# Second, pipelines can add complexity when we use them with multiple different data sets.  We'll see this when
# we run against our holdout data two cells below.  As you will see, we have to drop the prediction column before
# we run, which makes our pipeline inefficient.

# Spark ML will return the best model.  To determine "best," we let Spark ML use its default measure for linear
# regression, which is Root Mean Squared Error (RMSE)
# We'll use MLflow to log the best model, so others can retrieve and use it.
# Point the MLflow client at the Databricks tracking server.
os.environ.update({
    'MLFLOW_TRACKING_URI': 'databricks',
    'DATABRICKS_HOST': 'https://adb-8245268741408838.18.azuredatabricks.net',
    'DATABRICKS_TOKEN': args.api_token,
})
print(f"About to start mlflow with experiment_id {args.mlflow_experiment_id}")
with mlflow.start_run(experiment_id=args.mlflow_experiment_id) as run:
    print("mlflow run started")
    # Indexers, encoder and assembler feed the final `tvs` stage.
    pipeline = Pipeline(stages=[
        conditionIndexer, gradeIndexer, zipcodeIndexer,
        encoder, assembler, tvs,
    ])
    pipelineModel = pipeline.fit(df_input_training_and_test)

    # Log the fitted pipeline so others can retrieve it.
    print("About to log model to mlflow...")
    mlflow.spark.log_model(pipelineModel, "house-price-pipelineModel")
    print("... model logged to mlflow")

    best_run = run.info
    print(f"Best run: {best_run}")

# Print some interesting data about the best model
# Check model accuracy and chosen parameters

# To get the model, you need to access the TrainValidationSplit object in the pipeline, which is the 6th param
# (see the constructor above)
コード例 #9
0
sc = SparkContext(master='local', appName="HOME")

# Spark DataFrame to work with: a text label and three numeric columns.
df = spark.createDataFrame(
    [
        ('line_1', 100, 10, 1),
        ('line_2', 200, 20, 2),
        ('line_3', 300, 30, 2),
        ('line_4', 300, 30, 3),
        ('line_5', 200, 20, 1),
        ('line_6', 100, 10, 1),
    ],
    ("label", "x1", "x2", "x3"))

# Assemble columns 'x1', 'x2' and 'x3' into the vector column 'features'.
assembler = VectorAssembler(outputCol="features",
                            inputCols=["x1", "x2", "x3"])

# Pipeline whose only stage is the assembler.
pipelineResult = Pipeline(stages=[assembler])

# Fit the pipeline on 'df' ...
modelResult = pipelineResult.fit(df)

# ... and use the fitted model to transform the same data.
result_df = modelResult.transform(df)

# k-means with 3 clusters and a fixed seed.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(result_df)

# Sum of squared errors ('SSE') of the clustering.
SSE = model.computeCost(result_df)
print("Suma cuadrada de errores: " + str(SSE))
コード例 #10
0
    return result
    
num_nodes = 2

# data_path = "/home/hadoop/MillionSongSubset/data/A/A/A"
data_path = '/mnt/snap/data'
# TODO: fix with nested dir
# filenames = [os.path.join(data_path, filename) for filename in os.listdir(data_path)]
filenames = getListOfFiles(data_path)

# Distribute the file names over `num_nodes` partitions and expand every
# file into the records returned by read_h5_to_list.
rdd = sc.parallelize(filenames, num_nodes)
rdd1 = rdd.flatMap(read_h5_to_list)

# TODO: modified with attribute name
col_name = [
    "artist familiarity",
    "artist hotttnesss",
    "artist id",
    "artist location",
    "artist mbtags",
    "artist mbtags count",
    "artist name",
    "artist terms",
    "artist terms freq",
    "artist terms weight",
    "danceability",
    "duration",
    "end of fade in",
    "energy",
    "key",
    "key confidence",
    "loudness",
    "mode",
    "mode confidence",
    "release",
    "segments confidence",
    "segments loudness max",
    "segments loudness max time",
    "segments pitches",
    "segments timbre",
    "similar artists",
    "song hotttnesss",
    "song id",
    "start of fade out",
    "tempo",
    "time signature",
    "time signature confidence",
    "title",
    "track id",
    "year",
]

df1 = rdd1.toDF(col_name)

# Feed every column to the vectorizer and collect them in "features".
vectorizer.setInputCols(col_name)
vectorizer.setOutputCol("features")

# Vectorize, then cluster into two groups with a fixed seed.
kmeans = KMeans(k=2, seed=1)
lrPipeline = Pipeline(stages=[vectorizer, kmeans])
model = lrPipeline.fit(df1)
コード例 #11
0
# Training data: CSV with header and inferred schema, plus the extra
# column produced by ConditionExpr.
ModalDF = sqlContext.read.csv(
    TrainSource, header="True", inferSchema="True",
).selectExpr("*", ConditionExpr)

# Gather the input columns into a single vector column called "features".
vectorizer = VectorAssembler(
    inputCols=["Lat", "Long", "Ele", "LocalTime"],
    outputCol="features")

# One estimator per target: a multinomial logistic regression for the
# condition and linear regressions for temperature, pressure and humidity.
lr0 = LogisticRegression(labelCol="Condition",
                         predictionCol="Predicted_Cond",
                         maxIter=100,
                         regParam=0,
                         family="multinomial")
lr1 = LinearRegression(labelCol="Temp",
                       predictionCol="Predicted_Temp",
                       maxIter=100,
                       regParam=0.1)
lr2 = LinearRegression(labelCol="Pres",
                       predictionCol="Predicted_Pres",
                       maxIter=100,
                       regParam=0.1)
lr3 = LinearRegression(labelCol="Humid",
                       predictionCol="Predicted_Humid",
                       maxIter=100,
                       regParam=0.1)

# Chain the vectorizer and all four estimators, then fit on the data set.
lrPipeline = Pipeline(stages=[vectorizer, lr1, lr2, lr3, lr0])
lrModel = lrPipeline.fit(ModalDF)

# COMMAND ----------

"""
The Following code Take the Test Dataset and perform following actions
    - Gets GeoInformation (Latitude, Longitude,Elevation)
    - Gets Monthly Data&Timestamps for each record
    - Predict the Temperature, Pressure, Humidity & Condition using Pipeline Model
    - Change the Fomat and write it in a file
"""

# Read the test source, expand each line with get_geo_info, then flatten
# the results of get_datetime_info into individual records.
TestRDD = (
    sc.textFile(TestSource)
    .map(lambda line: list(get_geo_info(line)))
    .flatMap(lambda line: list(get_datetime_info(line)))
)
コード例 #12
0
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkConf, SparkContext
sc = SparkContext(master='local', appName="HOME")

# Spark DataFrame to work with.
df = spark.createDataFrame(
    [
        ('line_1', 1, 2, 3, 4),
        ('line_2', 5, 6, 7, 8),
        ('line_3', 9, 9, 9, 9),
    ],
    ("label", "x1", "x2", "x3", "x4"))

# Assemble columns 'x1' and 'x2' into 'features1', wrapped in a pipeline.
assembler12 = VectorAssembler(outputCol="features1", inputCols=["x1", "x2"])
pipeline12 = Pipeline(stages=[assembler12])

# Assemble columns 'x3' and 'x4' into 'features2', wrapped in a pipeline.
assembler34 = VectorAssembler(outputCol="features2", inputCols=["x3", "x4"])
pipeline34 = Pipeline(stages=[assembler34])

# Assemble 'features1' and 'features2' into the final 'features' column.
assemblerResult = VectorAssembler(outputCol="features",
                                  inputCols=["features1", "features2"])

# Outer pipeline: the two inner pipelines followed by the final assembler.
pipelineResult = Pipeline(
    stages=[pipeline12, pipeline34, assemblerResult])
コード例 #13
0
ファイル: tools.py プロジェクト: JosemyDuarte/twitterJudge
def entrenar_juez(sc,
                  sql_context,
                  juez_spam,
                  humanos,
                  ciborgs,
                  bots,
                  dir_juez,
                  mongo_uri=None,
                  num_trees=20,
                  max_depth=8):
    """Train the human / bot / cyborg random-forest judge and evaluate it.

    User features and per-user tweet features are joined, split 20/80 into
    test and training sets, and a 5-fold cross validation over ``maxBins``
    in {50, 100} selects the best random forest.  The model and the
    training set are persisted before the test split is evaluated.

    Args:
        sc: SparkContext forwarded to ``cargar_datos``.
        sql_context: SQLContext forwarded to ``cargar_datos``.
        juez_spam: spam model forwarded to ``tweets_features``.
        humanos: source of the accounts labelled human (category 0.0).
        ciborgs: source of the accounts labelled cyborg (category 2.0).
        bots: source of the accounts labelled bot (category 1.0).
        dir_juez: where the model is saved; the training set is written as
            JSON to ``dir_juez + "_trainingset"``.
        mongo_uri: when truthy, the training set is also saved to MongoDB.
        num_trees: number of trees of the random forest.
        max_depth: maximum depth of each tree.

    Returns:
        Tuple ``(rf_model, accuracy, matrix)`` where ``matrix`` is a 3x3
        list of counts; row = actual category (0, 1, 2) and column =
        predicted category (0, 1, 2).
    """
    logger.info("Entrenando juez...")
    df_humanos = cargar_datos(sc, sql_context, humanos)
    df_bots = cargar_datos(sc, sql_context, bots)
    df_ciborgs = cargar_datos(sc, sql_context, ciborgs)

    tweets_humanos = df_para_tweets(df_humanos)
    tweets_bots = df_para_tweets(df_bots)
    tweets_ciborgs = df_para_tweets(df_ciborgs)

    tweets_df = tweets_humanos.union(tweets_bots).union(tweets_ciborgs)

    df_humanos = df_humanos.dropDuplicates(["user_id"])
    df_bots = df_bots.dropDuplicates(["user_id"])
    df_ciborgs = df_ciborgs.dropDuplicates(["user_id"])

    tweets = tweets_features(tweets_df, juez_spam)
    tweets.cache()

    # Categories: 0.0 = human, 1.0 = bot, 2.0 = cyborg.  The local names now
    # match the data they hold (they were swapped before); the labels and
    # the union order are unchanged.
    usuarios_features_humanos = usuarios_features(df_humanos, 0.0)
    usuarios_features_bots = usuarios_features(df_bots, 1.0)
    usuarios_features_ciborgs = usuarios_features(df_ciborgs, 2.0)

    usuarios = usuarios_features_bots.union(usuarios_features_ciborgs).union(
        usuarios_features_humanos).cache()

    set_datos = usuarios.join(tweets, tweets.user_id == usuarios.user_id).drop(
        tweets.user_id).fillna(0).cache()

    # Plain int literal: the Python 2 "L" suffix is a syntax error on
    # Python 3 and unnecessary on Python 2.
    seed = 1800009193
    (split_20_df, split_80_df) = set_datos.randomSplit([20.0, 80.0], seed)

    test_set_df = split_20_df.cache()
    training_set_df = split_80_df.cache()

    vectorizer = VectorAssembler()
    vectorizer.setInputCols([
        "ano_registro", "con_descripcion", "con_geo_activo",
        "con_imagen_default", "con_imagen_fondo", "con_perfil_verificado",
        "entropia", "followers_ratio", "n_favoritos", "n_listas", "n_tweets",
        "reputacion", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
        "Saturday", "Sunday", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21",
        "22", "23", "uso_mobil", "uso_terceros", "uso_web",
        "avg_diversidad_lex", "avg_long_tweets", "reply_ratio", "avg_hashtags",
        "mention_ratio", "avg_palabras", "avg_diversidad_palabras",
        "url_ratio", "avg_spam"
    ])

    vectorizer.setOutputCol("features")

    rf = RandomForestClassifier()

    rf.setLabelCol("categoria") \
        .setPredictionCol("Predicted_categoria") \
        .setFeaturesCol("features") \
        .setSeed(seed) \
        .setMaxDepth(max_depth) \
        .setNumTrees(num_trees)

    rf_pipeline = Pipeline()
    rf_pipeline.setStages([vectorizer, rf])

    reg_eval = MulticlassClassificationEvaluator(
        predictionCol="Predicted_categoria",
        labelCol="categoria",
        metricName="accuracy")

    crossval = CrossValidator(estimator=rf_pipeline,
                              evaluator=reg_eval,
                              numFolds=5)
    param_grid = ParamGridBuilder().addGrid(rf.maxBins, [50, 100]).build()
    crossval.setEstimatorParamMaps(param_grid)

    logger.info("Buscando el mejor modelo de RandomForest")

    rf_model = crossval.fit(training_set_df).bestModel

    logger.info("Guardando en juez")
    guardar_juez(rf_model, dir_juez)
    logger.info("Guardando set de entrenamiento")
    training_set_df.write.json(dir_juez + "_trainingset", mode="overwrite")

    if mongo_uri:
        # Only announce the Mongo export when it actually happens.
        logger.info("Guardando en Mongo el set de entrenamiento")
        training_set_df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)

    logger.info("Evaluando set de prueba")

    predictions_and_labels_df = rf_model.transform(test_set_df)
    predictions_and_labels_df.cache()

    accuracy = reg_eval.evaluate(predictions_and_labels_df)

    logger.info("Calculando matriz de confusion")

    # One aggregation instead of nine separate filter + count jobs.
    conteos = {}
    for fila in predictions_and_labels_df.groupBy(
            "categoria", "Predicted_categoria").count().collect():
        conteos[(fila["categoria"], fila["Predicted_categoria"])] = \
            fila["count"]

    # Missing (actual, predicted) pairs count as 0, as the filters did.
    matriz_confusion = [[
        conteos.get((float(real), float(predicha)), 0)
        for predicha in range(3)
    ] for real in range(3)]

    return rf_model, accuracy, matriz_confusion
コード例 #14
0
# Keep the modelling columns (renaming "dem" to "label") and drop rows
# whose lagged "pro" value is not positive.
df_featured = (
    df_featured
    .select(col("dem").alias("label"), col("ts"), col("id"), col("hour"),
            col("weekday"), col("pro_lag1"), col("pre_lag1"), col("pro"),
            col("pre"))
    .filter(col("pro_lag1") > 0)
)
df_featured.printSchema()

# Training view: lagged predictors, hour, timestamp and label.
training_seti = df_featured.select(
    col("pro_lag1"), col("pre_lag1"), col("hour"), col("ts"), col("label"))

# Combine the three predictors into a single "features" vector.
vectorizer = VectorAssembler(inputCols=["pro_lag1", "pre_lag1", "hour"],
                             outputCol="features")

# Linear regression learner writing its output to "prediction".
lr = LinearRegression(predictionCol="prediction", maxIter=100, regParam=0.1)

# spark.ml pipeline: vectorize, then regress.
lrPipeline = Pipeline(stages=[vectorizer, lr])

lrModel = lrPipeline.fit(training_seti)

predicted_df = lrModel.transform(training_seti)
# display(predicted_df)

# For scoring, the current "pro"/"pre" values take the place of the lagged
# columns the model was trained on.
test_seti = df_featured.select(
    col("pro").alias("pro_lag1"),
    col("pre").alias("pre_lag1"),
    col("hour"),
    col("ts"))
predicted_test_df = lrModel.transform(test_seti)
コード例 #15
0
class SPARK_MODEL:

    #init model params
    def __init__(self, dataset, dataName, splitRatio, targetType,
                 targetVariable, split, nbSamples, goodClass, sparkModelsId,
                 sparkLearningMethods, sparkOptions, numClasses, extDataSet):
        self.dataset = dataset
        self.dataName = dataName
        self.splitRatio = splitRatio
        self.targetType = targetType
        self.targetVariable = targetVariable
        self.split = split
        self.nbSamples = nbSamples
        self.goodClass = goodClass
        self.sparkModelsId = sparkModelsId
        self.sparkLearningMethods = sparkLearningMethods
        self.sparkOptions = sparkOptions
        self.numClasses = numClasses
        self.extDataSet = extDataSet

    #rdd methods
    def _set_rdd(self, dataset):
        """Load ``dataset`` into ``self._rdd`` as an RDD of parsed records.

        The first line of the file is treated as a header and removed.
        Remaining lines are parsed with ``classParsePoint`` when
        ``self.targetType`` is ``'classification'`` and with
        ``regParsePoint`` otherwise.  The first parsed record is printed.

        Args:
            dataset: path handed to ``sc.textFile``.
        """
        self._rdd = sc.textFile(dataset, 8)
        header = self._rdd.first()
        # Drop every line equal to the header line.
        self._rdd = self._rdd.filter(lambda line: line != header)

        # print(...) with one argument works the same on Python 2 and 3 and
        # matches the call style already used elsewhere in this class.
        if self.targetType == 'classification':
            print("class")
            parser = classParsePoint
        else:
            parser = regParsePoint
        self._rdd = self._rdd.map(parser)

        print(self._rdd.first())

    def _get_rdd(self):
        """Return the RDD stored in ``self._rdd``."""
        return self._rdd

    def _get_rddTest(self):
        """Return the test RDD stored in ``self._rddTest``."""
        return self._rddTest

    def _get_rddTraining(self):
        """Return the training RDD stored in ``self._rddTraining``."""
        return self._rddTraining

    def _get_rddModel(self):
        """Return the RDD-API model stored in ``self._rddModel``."""
        return self._rddModel

    #model building: rdd
    def _set_rddModel(self, _type, _SLA, data):
        if _type == 'regression':
            if _SLA == 'randomForest':
                self._rddModel = RandomForest.trainRegressor(
                    data,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity='variance',
                    maxDepth=int(self.sparkOptions[1]),
                    maxBins=32)
            else:
                self._rddModel = ""
        else:  #classification
            if _SLA == 'randomForest':
                print self.numClasses
                self._rddModel = RandomForest.trainClassifier(
                    data,
                    numClasses=self.numClasses,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2])
            else:
                self._rddModel = ""

    def splitData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self._rdd.randomSplit(
                [1 - self.splitRatio, self.splitRatio])
        else:

            print "ExternalValidation"
            self._rddTraining = self._rdd

            self._rddTest = sc.textFile(self.extDataSet, 8)
            header = self._rddTest.first()
            self._rddTest = self._rddTest.filter(lambda line: line != header)

            if self.targetType == 'classification':
                self._rddTest = self._rddTest.map(classParsePoint)
            else:
                self._rddTest = self._rddTest.map(regParsePoint)

    #rdd/dataFrame method
    def rddToDataFrame(self, rdd):
        """Return ``rdd.toDF()``; no instance state is used."""
        return rdd.toDF()

    def dataFrameToRdd(self, dataFrame):
        """Return ``dataFrame.rdd``; no instance state is used."""
        return dataFrame.rdd

    #dataFrame method
    def _set_dataFrame(self):
        """Load ``self.dataset`` as a CSV DataFrame into ``self._dataFrame``.

        The file is read with ``;`` as delimiter, a header row, schema
        inference and the empty string as null value.  The column named by
        ``self.targetVariable`` is then cast to ``double``.
        """
        frame = sqlContext.read.format('csv').options(
            delimiter=';', header='true', inferschema='true',
            nullValue='').load(self.dataset)
        # Cast from the frame just loaded.  The previous code read
        # ``self.dataFrame`` (no underscore), which is not defined in the
        # visible part of this class.
        self._dataFrame = frame.withColumn(
            self.targetVariable,
            frame[self.targetVariable].cast("double"))

    def _get_dataFrame(self):
        """Return the DataFrame stored in ``self._dataFrame``."""
        return self._dataFrame

    def _get_dataFrameTest(self):
        """Return the test DataFrame stored in ``self._dataFrameTest``."""
        return self._dataFrameTest

    def _get_dataFrameTraining(self):
        """Return the training DataFrame in ``self._dataFrameTraining``."""
        return self._dataFrameTraining

    def splitDataFrameData(self):
        """Build ``self._dataFrameTest`` and ``self._dataFrameTraining``.

        Unless ``self.split`` is ``"ExternalValidation"``, the RDD behind the
        loaded DataFrame is split randomly with weights
        ``[1 - splitRatio, splitRatio]`` (test first, training second);
        otherwise ``splitData()`` provides the two RDDs.  Both RDDs are
        then converted to DataFrames with ``toDF()``.
        """
        if self.split == "ExternalValidation":
            self.splitData()
        else:
            source_rdd = self.dataFrameToRdd(self._get_dataFrame())
            parts = source_rdd.randomSplit(
                [1 - self.splitRatio, self.splitRatio])
            (self._rddTest, self._rddTraining) = parts

        self._dataFrameTest = self._rddTest.toDF()
        self._dataFrameTraining = self._rddTraining.toDF()

    def _get_dataFrameModel(self):
        """Return the DataFrame-API model in ``self._dataFrameModel``."""
        return self._dataFrameModel

    def _get_pipeline(self):
        """Return the pipeline stored in ``self._pipeline``."""
        return self._pipeline

    def _get_crossval(self):
        """Return the cross validator stored in ``self._crossval``."""
        return self._crossval

    def _get_paramGrid(self):
        """Return the parameter grid stored in ``self._paramGrid``."""
        return self._paramGrid

    def _get_regEval(self):
        """Return the evaluator stored in ``self._regEval``."""
        return self._regEval

    #model building: dataframe
    def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):

        if _type == 'regression':
            if _SLA == 'randomForest':
                rf = RandomForestRegressor()
                rf.setLabelCol(self.targetVariable)\
                  .setPredictionCol("prediction")\
                  .setFeaturesCol("features")\
                  .setProbabilityCol("proba")\
                  .setSeed(100088121L)\
                  .setMaxDepth(int(self.sparkOptions[1]))\
                  .setMaxMemoryInMB(10000)\
                  .setFeatureSubsetStrategy(self.sparkOptions[5])
                self._regEval = RegressionEvaluator(
                    predictionCol="prediction",
                    labelCol=self.targetVariable,
                    metricName="rmse")

        else:  #classification
            if _SLA == 'randomForest':
                rf = RandomForestClassifier(
                    labelCol=self.targetVariable,
                    featuresCol="features",
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2],
                    probabilityCol="proba")
                if goodClass != '':
                    self.regEval = BinaryClassificationEvaluator(
                        labelCol=self.targetVariable,
                        metricName="areaUnderROC")
                else:
                    self.regEval = MulticlassClassificationEvaluator(
                        labelCol=self.targetVariable,
                        predictionCol="prediction",
                        metricName="accuracy")

        # Create a Pipeline
        self._pipeline = Pipeline()
        # Set the stages of the Pipeline #vecAssembler
        self._pipeline.setStages([vecAssembler, rf])
        # GridSearch
        self._paramGrid = (ParamGridBuilder().addGrid(
            rf.numTrees,
            [int(num) for num in self.sparkOptions[4].split(',')]).build())
        # Add the grid to the CrossValidator
        self._crossval = CrossValidator(estimator=self._pipeline,
                                        estimatorParamMaps=self._paramGrid,
                                        evaluator=self._regEval,
                                        numFolds=self.nbSamples)
        # Now let's find and return the best model
        self._dataFrameModel = self._crossval.fit(data).bestModel

        #to be removed
        #print rf.getNumTrees()
        #modelText = str(self._dataFrameModel.stages[-1])
        #._java_obj.toDebugString()
        #nbTrees = int(re.sub('.*?([0-9]*) trees$',r'\1',modelText))
        #print nbTrees
        # end TBR

        rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
                str(self.sparkModelsId[0]))

    #end function

    #model evaluation
    #classification
    def computeKappa(self, m):
        """Return Cohen's kappa for the square confusion matrix ``m``.

        Kappa is ``(P0 - PE) / (1 - PE)`` where ``P0`` is the observed
        agreement (trace / total) and ``PE`` the agreement expected by
        chance (sum over classes of the product of the two marginal
        proportions).

        Args:
            m: square 2-D array-like of counts.

        Returns:
            The kappa statistic as a float.  It is undefined when ``PE``
            equals 1, in which case NumPy's division result is returned.
        """
        # Work in floats so integer matrices never hit integer division.
        m = np.asarray(m, dtype=float)
        total = m.sum()

        observed = m.trace() / total
        # Dot product of the marginals replaces np.sum over a generator,
        # which NumPy has deprecated.
        expected = np.dot(m.sum(axis=0), m.sum(axis=1)) / (total * total)
        return (observed - expected) / (1 - expected)

    def computeBA(self, m):
        """Return the balanced accuracy of the confusion matrix ``m``.

        This is the mean over classes of ``m[i][i]`` divided by the sum of
        row ``i``.

        Args:
            m: square 2-D array-like of counts.

        Returns:
            The balanced accuracy as a float; a row summing to zero yields
            NumPy's division result (nan).
        """
        m = np.asarray(m, dtype=float)
        # Vectorized per-row ratio instead of np.sum over a generator,
        # which NumPy has deprecated.
        per_class = np.diag(m) / m.sum(axis=1)
        return np.sum(per_class) / m.shape[0]

    #rdd model evalution
    def getRddPredictionsLabels(self, model, test_data):
        """Return an RDD of ``(prediction, label)`` pairs for ``test_data``.

        Args:
            model: object whose ``predict`` accepts an RDD of features.
            test_data: RDD of records exposing ``features`` and ``label``.
        """
        feature_rdd = test_data.map(lambda point: point.features)
        label_rdd = test_data.map(lambda point: point.label)
        return model.predict(feature_rdd).zip(label_rdd)

    def printRddMulticlassClassificationMetrics(self, predictions_and_labels):
        """Print kappa, balanced accuracy and the confusion matrix.

        Args:
            predictions_and_labels: RDD of (prediction, label) pairs handed
                to ``MulticlassMetrics``.
        """
        metrics = MulticlassMetrics(predictions_and_labels)
        # Fetch the confusion matrix once and reuse it for all three lines.
        CMarray = metrics.confusionMatrix().toArray()
        # print(...) with one argument works on Python 2 and 3 alike.
        print("KAPPA=" + str(self.computeKappa(np.array(CMarray))))
        print("BA=" + str(self.computeBA(np.array(CMarray))))
        #CMstring = ','.join(['%.5f' % num for num in CMarray])
        print("CM=" + str(CMarray))

    def printRddBinaryClassificationMetrics(self, predictions_and_labels):
        """Print kappa, balanced accuracy and the confusion matrix.

        Args:
            predictions_and_labels: RDD of (prediction, label) pairs handed
                to ``BinaryClassificationMetrics``.
        """
        metrics = BinaryClassificationMetrics(predictions_and_labels)
        # NOTE(review): pyspark.mllib's BinaryClassificationMetrics does not
        # appear to expose confusionMatrix(); the only call to this method
        # in evaluateRddClassificationModel is commented out -- confirm
        # before relying on it.
        # Fetch the confusion matrix once and reuse it for all three lines.
        CMarray = metrics.confusionMatrix().toArray()
        # print(...) with one argument works on Python 2 and 3 alike.
        print("KAPPA=" + str(self.computeKappa(np.array(CMarray))))
        print("BA=" + str(self.computeBA(np.array(CMarray))))
        #CMstring = ','.join(['%.5f' % num for num in CMarray])
        print("CM=" + str(CMarray))

    def evaluateRddClassificationModel(self):
        """Print classification metrics of the RDD model on the test RDD.

        Predictions come from ``self._get_rddModel()`` applied to
        ``self._get_rddTest()``.
        """
        predictions_and_labels = self.getRddPredictionsLabels(
            self._get_rddModel(), self._get_rddTest())
        # Binary (goodClass set) and multiclass runs both used the
        # multiclass report; the binary-specific call was disabled:
        #self.printRddBinaryClassificationMetrics(predictions_and_labels)
        self.printRddMulticlassClassificationMetrics(predictions_and_labels)

    def evaluateRddRegressionModel(self):
        """Print regression metrics of the RDD model on the test RDD.

        Reports MSE, RMSE, R-squared, MAE and explained variance from
        ``RegressionMetrics`` over the (prediction, label) pairs.
        """
        pairs = self.getRddPredictionsLabels(self._get_rddModel(),
                                             self._get_rddTest())
        metrics = RegressionMetrics(pairs)

        # (output template, RegressionMetrics attribute), in print order.
        report = (
            ("MSE = %s", "meanSquaredError"),
            ("RMSE = %s", "rootMeanSquaredError"),
            ("R-squared = %s", "r2"),
            ("MAE = %s", "meanAbsoluteError"),
            ("Explained variance = %s", "explainedVariance"),
        )
        for template, attribute in report:
            print(template % getattr(metrics, attribute))

    def evaluateDataFrameRegressionModel(self):
        """Print RMSE and R-squared of the DataFrame model on the test set.

        ``self._regEval`` is evaluated twice on the predictions: once with
        its configured metric (``rmse`` when built by
        ``_set_dataFrameModel`` for regression) and once with the metric
        overridden to ``"r2"``.
        """
        evaluator = self._regEval
        scored = self._dataFrameModel.transform(self._dataFrameTest)

        # Configured metric first, then the same evaluator with "r2".
        rmseRF = evaluator.evaluate(scored)
        r2RF = evaluator.evaluate(scored, {evaluator.metricName: "r2"})

        print("RMSE = %s" % rmseRF)
        print("R-squared = %s " % r2RF)

    def evaluateDataFrameClassificationModel(self, sc):
        """Placeholder: DataFrame classification evaluation is not implemented.

        The body only assigns a throwaway local and returns ``None``;
        ``sc`` is not used.
        """
        #here we have a problem
        a = 1

    #save models
    def saveRddModel(self, sc):
        """Save ``self._rddModel`` under the model path for this run.

        Any existing folder at the target path is removed first with
        ``remove_folder``.

        Args:
            sc: SparkContext passed to the model's ``save``.
        """
        # Build the path once, always through str(): the removal used to
        # concatenate the raw id and raised TypeError for non-string ids.
        modelPath = ("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                     str(self.sparkModelsId[0]))
        remove_folder(modelPath)
        self._rddModel.save(sc, modelPath)

    def saveDataFrameModel(self):
        """Refit on the full DataFrame and save the best model and pipeline.

        The cross validator is fit on ``self._dataFrame``; the best model
        replaces ``self._dataFrameModel``.  The tree count parsed from the
        last stage's string form is printed.  The model is saved at the
        run's model path (after removing any existing folder there) and
        the pipeline at that path plus ``"_Pipeline"``.
        """
        #final model to save
        #self._dataFrameModel = self._pipeline.fit(self._dataFrame)
        self._dataFrameModel = self._crossval.fit(self._dataFrame).bestModel

        # The last stage's string form is expected to end in "<n> trees".
        modelText = str(self._dataFrameModel.stages[-1])
        #._java_obj.toDebugString()
        nbTrees = int(re.sub('.*?([0-9]*) trees$', r'\1', modelText))
        # print(...) with one argument works on Python 2 and 3 alike.
        print(nbTrees)

        # Build the path once, always through str(): the removal used to
        # concatenate the raw id and raised TypeError for non-string ids.
        modelPath = ("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                     str(self.sparkModelsId[0]))
        remove_folder(modelPath)
        self._dataFrameModel.save(modelPath)
        self._pipeline.save(modelPath + "_Pipeline")

    def buildRDDModel(self, sparkContext):
        """Train, evaluate, refit and save the RDD-API random forest model.

        Steps, in order:
          1. initialise the RDD from ``self.dataset`` and split it;
          2. dump the training and test RDDs to CSV files via pandas;
          3. fit a random forest on the training RDD and print its
             evaluation (classification or regression, per
             ``self.targetType``);
          4. refit on the full RDD and save that final model.

        Args:
            sparkContext: forwarded to ``saveRddModel``.
        """
        print("RDD_MODEL")

        # init RDD from dataset
        self._set_rdd(self.dataset)
        # split into test - training set
        self.splitData()

        # Save rddTraining and rddTest as CSV (to be copied to the PLP server).
        outputPrefix = ('/home/t752887/data/output/' +
                        str(self.sparkModelsId[0]) + '_' + self.dataName)
        self._rddTraining.toDF().toPandas().to_csv(outputPrefix +
                                                   '_training.csv')
        self._rddTest.toDF().toPandas().to_csv(outputPrefix + '_test.csv')

        # Anything other than 'classification' is treated as regression.
        # (Could become a loop over several model kinds.)
        if self.targetType == 'classification':
            task = 'classification'
            evaluate = self.evaluateRddClassificationModel
        else:
            task = 'regression'
            evaluate = self.evaluateRddRegressionModel

        # Fit on the training split only, so the evaluation is out-of-sample.
        self._set_rddModel(task, 'randomForest', self._get_rddTraining())
        evaluate()

        # Final model to save: refit on the complete RDD.
        self._set_rddModel(task, 'randomForest', self._get_rdd())

        self.saveRddModel(sparkContext)

    def buildDataFrameModel(self, sparkContext=None):
        """Train, evaluate and save the DataFrame-API random forest model.

        Initialises and splits the DataFrame, assembles every column except
        ``self.targetVariable`` into a ``features`` vector, builds the model
        on the training split, evaluates it and saves it.

        Args:
            sparkContext: optional; only forwarded to
                ``evaluateDataFrameClassificationModel`` on the
                non-regression path. Previously this name was referenced
                without being a parameter or local of this method.
        """
        # init dataframe from dataset
        self._set_dataFrame()
        # split into test - training set
        self.splitDataFrameData()

        # Vector assembler over every column except the target.
        featureColumns = [
            name for name in self._dataFrameTraining.columns
            if name != self.targetVariable
        ]
        vecAssembler = VectorAssembler(inputCols=featureColumns,
                                       outputCol="features")

        # Cross-validation pipeline with model selection, built on the
        # training split.
        # NOTE(review): 'regression' is passed for every target type, as in
        # the original code - confirm whether the classification path should
        # pass 'classification' once it is implemented.
        self._set_dataFrameModel('regression', 'randomForest',
                                 self._get_dataFrameTraining(), vecAssembler)

        # Evaluate the best model.
        if self.targetType == 'regression':
            self.evaluateDataFrameRegressionModel()
        else:
            # TODO: classification evaluation is still a stub.
            self.evaluateDataFrameClassificationModel(sparkContext)

        # saveDataFrameModel takes no argument; passing one raised TypeError.
        self.saveDataFrameModel()

    def performModelSelection(self):
        """Return 0 when ``self.sparkOptions[4]`` converts to float, else 1.

        A ``ValueError`` or ``TypeError`` raised while reading or converting
        the option yields 1; any other exception propagates.
        """
        try:
            # Only convertibility matters; the converted value is discarded.
            float(self.sparkOptions[4])
        except (ValueError, TypeError):
            return 1
        else:
            return 0

    # Attribute-style accessors for the DataFrame-API state, wired to the
    # class's private getter/setter methods. Properties given only a getter
    # are read-only.
    # NOTE(review): elsewhere in this class _set_dataFrame is called with no
    # argument and _set_dataFrameModel with four, whereas a property setter
    # receives exactly one value - confirm that assigning through
    # `dataFrame` / `dataFrameModel` is actually supported.
    dataFrame = property(_get_dataFrame, _set_dataFrame)
    dataFrameTest = property(_get_dataFrameTest)
    dataFrameTraining = property(_get_dataFrameTraining)
    dataFrameModel = property(_get_dataFrameModel, _set_dataFrameModel)
    pipeline = property(_get_pipeline)
    crossval = property(_get_crossval)
    paramGrid = property(_get_paramGrid)
    regEval = property(_get_regEval)

    # Attribute-style accessors for the RDD-API state.
    # NOTE(review): _set_rddModel is called with three arguments elsewhere in
    # this class, so assignment through `rddModel` may not work - confirm.
    rdd = property(_get_rdd, _set_rdd)
    rddTest = property(_get_rddTest)
    rddTraining = property(_get_rddTraining)
    rddModel = property(_get_rddModel, _set_rddModel)
コード例 #16
0
  .setLabelCol("PE")\		#col in df
  .setMaxIter(100)\
  .setRegParam(0.15)

###########################
# create a pipeline
# - a pipeline contains a series of stages executed sequentially
# - each stage is either an estimator or a transformer
# - for each stage, pipeline.fit() calls one of the following:
#	* estimator.fit()
#	* transformer.transform()
# - the fitted model is a PipelineModel
###########################

# Two stages run in sequence: `vectorizer` first, then the linear
# regression `lr`.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
lrModel = lrPipeline.fit(trainSetDF)  # returns a PipelineModel

# Results must be read from a specific stage of the PipelineModel: the
# fitted regression is the second stage (index 1), which exposes the
# intercept and the coefficients.
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Keep only the feature columns, i.e. exclude the "PE" column.
featuresNoLabel = [col for col in datasetDF.columns if col != "PE"]

# Pair each weight with its feature name, ordered from the greatest absolute
# weight down. sorted() is used instead of zip(...).sort() because zip()
# returns an iterator with no .sort() on Python 3; the original also
# referenced the misspelled names `featureNoLabel` and `coefficient`.
coefficients = sorted(zip(weights, featuresNoLabel),
                      key=lambda tup: abs(tup[0]),
                      reverse=True)

######################
コード例 #17
0
# Index the categorical 'ocean_proximity' column, then one-hot encode the
# index; each intermediate DataFrame is displayed.
indexer = StringIndexer(inputCol='ocean_proximity',
                        outputCol='idx_ocean_proximity')
idxHousing = indexer.fit(renamedHousing).transform(renamedHousing)
idxHousing.show()

encoder = OneHotEncoder(inputCol='idx_ocean_proximity',
                        outputCol='one_hot_ocean_proximity')
ohHousing = encoder.fit(idxHousing).transform(idxHousing)
ohHousing.show()

# Step 4: run the numeric stages first, drop their intermediate 'features'
# column, then reuse the same Pipeline object for the categorical stages.
numPipeline = [imputer, va, scaler]
catPipeline = [indexer, encoder]
pipeline = Pipeline(stages=numPipeline)
newHousing = (pipeline.fit(renamedHousing)
              .transform(renamedHousing)
              .drop('features'))
newHousing.show()
pipeline.setStages(catPipeline)  # setStages returns the same pipeline object
newHousing = pipeline.fit(newHousing).transform(newHousing)
newHousing.show()


# Combine the scaled numeric vector and the one-hot vector into the final
# 'features' column and keep only what the regression needs.
va2 = VectorAssembler(
    inputCols=['scaled_features', 'one_hot_ocean_proximity'],
    outputCol='features')
dataset = va2.transform(newHousing).select("features", "label")
dataset.show(n=100, truncate=False)

# 80/20 split into training and test data.
trainingData, testData = dataset.randomSplit([0.8, 0.2])

lr = LinearRegression()
lrModel = lr.fit(trainingData)
print("Coefficients: %s" % (str(lrModel.coefficients),))
コード例 #18
0
ファイル: SmishingMain.py プロジェクト: raishahnawaz/smishing
    print("Labeled Messages: ", indexed.count())
    # print ("OK Messages: ", indexed["label"]=="OK")
    # print ("FRAUD Messages: ", indexed.count(indexed["label"]=="FRAUD"))
    # print ("SPAM Messages: ", indexed.count(indexed["label"]=="SPAM"))

    indexed.cube("label").count().orderBy("label").na.drop(
        subset=["label"]).show()

    print("Indexed Schema: ", indexed.schema)
    print("Labeled Data: ", indexed.count())

    indexed.select("LabelIndex", "features").show()

    TruePipeline = Pipeline()
    TruePipeline.setStages([tokenizer, hashingTF, idf, indexer])
    TrueFraudsModel = TruePipeline.fit(
        TrueLabeledMessages.na.drop(subset=["body"]))

    TrueFraudsIndexed = TrueFraudsModel.transform(
        TrueLabeledMessages.na.drop(subset=["body"]))
    print("TrueFrauds Indexed Schema: ", TrueFraudsIndexed.schema)
    print("TrueFrauds Count: ", TrueFraudsIndexed.count())
    TrueFraudsIndexed.select("LabelIndex", "features").show()

    #Evaluation.NaiveBayesEvaluation(indexed)
    print("Evaluation With true labeled fraud")
    Evaluation.NaiveBayesEvaluation(TrueFraudsIndexed)

    #2  Classification with Subset
コード例 #19
0
def linear_regression_to_predict_number_of_deaths():
    """
    Worldwide cancer mortality figures are considered for females aged 20-70 years old.
        > The mean death numbers by age are visualised initially.
        > As the data is curved, a polynomial regression is performed on it with
        the intention of predicting death numbers in the 50-54 age range

    Side effects: prints tables, correlations, model details and evaluation
    metrics to stdout, and writes ``female_cancer_deaths.png`` into
    ``output_dir`` (created if missing). Returns nothing.
    """
    # Relevant columns for 20-70yrs are Deaths[10-19]
    column_index = [str(x) for x in range(10, 20)]
    # Column being predicted (the 50-54 years old range).
    target_column = 'Deaths16'

    sum_columns = ', '.join(f'sum(df.Deaths{x})' for x in column_index)
    cancer_mortality_data.createOrReplaceTempView('df')
    female_yearly_totals = helper.spark.sql(
        f'select df.Year, {sum_columns} from df '
        'where df.Sex=2 '
        'group by df.Year '
        'order by df.Year asc').toDF('Year',
                                     *[f'Deaths{x}' for x in column_index])
    # show() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    female_yearly_totals.show()

    # Check Pearson Correlation Coefficient between independent variables and
    # the target variable
    for column in female_yearly_totals.columns:
        if column != 'Year':
            correlation = female_yearly_totals.stat.corr(target_column, column)
            print(f'Correlation between {column} and {target_column}: '
                  f'{correlation}')

    # plot mean deaths by age to find out the shape of this dataset
    plot_df = female_yearly_totals.toPandas()
    plot_df.drop(
        columns=['Year'],
        inplace=True)  # drop the Year column as it's not needed for this graph
    x_axis = list(range(len(plot_df.columns)))
    y_axis = [int(plot_df[y].mean()) for y in plot_df.columns]

    # get age range labels
    age_ranges = [
        helper.age_ranges.select('00').filter(
            (helper.age_ranges['index'] == str(x))).collect()[0][0]
        for x in column_index
    ]
    plt.figure(figsize=(9, 6))  # set plotted figure size
    plt.xticks(np.arange(len(age_ranges)), age_ranges)
    plt.ylabel('Yearly Average Deaths')
    plt.xlabel('Age')
    plt.title('Cancer Deaths (Female, 20-70 years old)')
    plt.plot(x_axis, y_axis)
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.EngFormatter())
    # exist_ok avoids the check-then-create race and also creates any missing
    # parent directories.
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(f'{output_dir}/female_cancer_deaths.png')
    plt.clf()

    # split into training and test sets
    (training, test) = female_yearly_totals.randomSplit([.7, .3])
    training.cache()
    test.cache()

    # exclude 'Deaths16' (50-54 years old range) from features in training
    column_index.remove('16')
    vectorised = VectorAssembler(
        inputCols=[f'Deaths{x}' for x in column_index], outputCol='features')

    poly_expansion = PolynomialExpansion(degree=3,
                                         inputCol='features',
                                         outputCol='poly_features')

    # set label column to the target as this is the column value being predicted
    lr = LinearRegression(
        maxIter=10,
        regParam=0.5).setLabelCol(target_column).setPredictionCol('predicted')

    lr_pipeline = Pipeline()
    lr_pipeline.setStages([vectorised, poly_expansion, lr])

    # Fit the model
    model = lr_pipeline.fit(training)

    # predict using test data
    predictions = model.transform(test).select('Year', target_column,
                                               'poly_features', 'predicted')
    predictions.show()

    # The fitted regression is the third pipeline stage (index 2).
    model_details = model.stages[2]
    print('_____________\nModel details:\n_____________')
    # Print the coefficients and intercept for the linear regression model
    print('Coefficients: ' + str(model_details.coefficients))
    print('Intercept: ' + str(model_details.intercept))

    # Summarize the model over the training set and print out some metrics
    summary = model_details.summary
    print('Coefficient Standard Errors: ' +
          str(summary.coefficientStandardErrors))
    print('T Values: ' + str(summary.tValues))
    print('P Values: ' + str(summary.pValues))
    print('r^2: ' + str(summary.r2))
    print('Mean Squared Error: ' + str(summary.meanSquaredError))
    print('Mean Absolute Error: ' + str(summary.meanAbsoluteError))
    print('Explained variance: ' + str(summary.explainedVariance))
    print('Degrees Of Freedom: ' + str(summary.degreesOfFreedom))
    print('Deviance Residuals: ' + str(summary.devianceResiduals))

    # Evaluation metrics for test dataset
    # Create an RMSE evaluator using the label and predicted columns
    reg_eval = RegressionEvaluator(predictionCol='predicted',
                                   labelCol=target_column,
                                   metricName='rmse')

    # Run the evaluator on the DataFrame
    print('_____________\nPrediction evaluation:\n_____________')
    rmse = reg_eval.evaluate(predictions)
    print(f'Root Mean Squared Error: {rmse}')

    # Mean Square Error
    mse = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mse'})
    print(f'Mean Square Error: {mse}')

    # Mean Absolute Error
    mae = reg_eval.evaluate(predictions, {reg_eval.metricName: 'mae'})
    print(f'Mean Absolute Error: {mae}')

    # r2 - coefficient of determination
    r2 = reg_eval.evaluate(predictions, {reg_eval.metricName: 'r2'})
    print(f'r^2: {r2}')