Example #1
def train_and_save_pipeline(model_path: str,
                            data_path: str,
                            estimator_class: type,
                            params: dict = ()):
    """
    Train and save a pipeline containing a single estimator with the specified parameters.

    This is the action called from the command line handler for a train subcommand.

    :param model_path: where to save the trained pipeline model
    :param data_path: data on which to train the model
    :param estimator_class: the estimator to train
    :param params: dictionary of optional estimator parameters
    """
    def snake_keys_to_camel(snake_dict: dict):
        def snake_to_camel(text: str):
            return re.sub(r"_([a-zA-Z0-9])", lambda m: m.group(1).upper(),
                          text)

        return dict((snake_to_camel(k), v) for k, v in snake_dict.items())

    data = spark().read.load(data_path)
    # Command names taken from Click parameters are lowercase underscore-delimited "snake-case" strings, while Spark
    # estimators take camel-case parameters.
    params = snake_keys_to_camel(dict(params))
    estimator = estimator_class().setParams(**params)
    pipeline = Pipeline(stages=[estimator]).fit(data)
    pipeline.save(model_path)
    logging.info(f"Created pipeline model in {model_path}")
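A minimal load-back sketch to complement the example above (an assumption, not part of the original project): once train_and_save_pipeline has run, the fitted pipeline can be restored from model_path and applied to new data.

from pyspark.ml import PipelineModel

loaded = PipelineModel.load(model_path)            # same path passed to train_and_save_pipeline
scored = loaded.transform(spark().read.load(data_path))
scored.show(5)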
Example #2
def main():

    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    text_file = sc.textFile("s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

    #3. Transform data
    af = (text_file.map(getVals))

    #4. Create a DataFrame out of this using the toDF method and cache it
    afdf = af.toDF([
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'duration'
    ]).cache()
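    # Assumption not shown in this excerpt: the stages below expect a vector column
    # named "features" and a numeric "label" column, which this DataFrame of seven raw
    # audio columns does not yet contain -- a VectorAssembler (and a label selection)
    # would normally be added before the VectorIndexer/RandomForestRegressor stages.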

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(afdf)

    #5. Create a train/test split with 70% of data in training set and 30% of data in test set
    afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(afdf_train)

    # Make predictions.
    predictions = model.transform(afdf_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    rfModel = model.stages[1]
    print(rfModel)  # summary only

    #Step 3: Building our Pipelines

    rfModel.save('s3a://spotifybuck/model-export' +
                 datetime.now().strftime('%Y%m%d%H%M'))
    pipeline.save('s3a://spotifybuck/pipeline-export' +
                  datetime.now().strftime('%Y%m%d%H%M'))

    sc.stop()
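A hedged sketch of reading the saved artifacts back (not in the original script): the S3 suffixes below stand in for whatever timestamp datetime.now() produced at save time.

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressionModel

saved_model_path = 's3a://spotifybuck/model-export<timestamp>'        # placeholder suffix
saved_pipeline_path = 's3a://spotifybuck/pipeline-export<timestamp>'  # placeholder suffix

rf_model = RandomForestRegressionModel.load(saved_model_path)   # fitted forest
unfit_pipeline = Pipeline.load(saved_pipeline_path)             # indexer + rf definition, not fitted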
Example #3
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #4
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #5
def test2():
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA])
    print(type(pipeA))
    pipeA.save('testA.pipe')
    pipeAA = PysparkPipelineWrapper.unwrap(Pipeline.load('testA.pipe'))
    stagesAA = pipeAA.getStages()
    trAA = stagesAA[0]
    print(trAA.dataset_count)
Example #6
def test3():
    sparkSession = (SparkSession.builder.master("local[*]").appName("test")
                    .config("spark.jars", "file:///E:/tmp/mysql-connector-java-5.1.39.jar")
                    .getOrCreate())
    dfA = make_a_dataframe(sparkSession.sparkContext)
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA]).fit(dfA)
    print(type(pipeA))
    pipeA.save('testB.pipe')
    pipeAA = PysparkPipelineWrapper.unwrap(PipelineModel.load('testB.pipe'))
    stagesAA = pipeAA.stages
    trAA = stagesAA[0]
    print(trAA.dataset_count)
    dfB = pipeAA.transform(dfA)
    dfB.show()
Example #7
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param),
                             tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param),
                             loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance,
                             loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #8
    def test_pipeline_persistence(self):
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            pl = Pipeline(stages=[tf, pca])
            model = pl.fit(df)
            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self.assertEqual(loaded_pipeline.uid, pl.uid)
            self.assertEqual(len(loaded_pipeline.getStages()), 2)

            [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
            self.assertIsInstance(loaded_tf, HashingTF)
            self.assertEqual(loaded_tf.uid, tf.uid)
            param = loaded_tf.getParam("numFeatures")
            self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))

            self.assertIsInstance(loaded_pca, PCA)
            self.assertEqual(loaded_pca.uid, pca.uid)
            self.assertEqual(loaded_pca.getK(), pca.getK())

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            [model_tf, model_pca] = model.stages
            [loaded_model_tf, loaded_model_pca] = loaded_model.stages
            self.assertEqual(model_tf.uid, loaded_model_tf.uid)
            self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))

            self.assertEqual(model_pca.uid, loaded_model_pca.uid)
            self.assertEqual(model_pca.pc, loaded_model_pca.pc)
            self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #9
def test_save_pipeline(spark_context, classification_model):
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(10)
    estimator.set_batch_size(10)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    pipeline.save('tmp')
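Note that pipeline.save('tmp') above persists the unfitted Pipeline (stage definitions and parameters), not a trained model. A minimal reload sketch, assuming ElephasEstimator supports Spark ML persistence and is importable in the loading session:

from pyspark.ml import Pipeline

restored = Pipeline.load('tmp')
print(restored.getStages())   # should contain the configured ElephasEstimator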
Example #10
def kmeansresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    kmeans = KMeans(k=8,
                    seed=1,
                    featuresCol='rawFeatures',
                    maxIter=10,
                    initMode='random')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans])
    pipeline.save("KMeansPipeline")
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # Note: "display" is for Databricks; use show() on OSS Apache Spark
    # results.filter(results.prediction == 1).show(200,False)
    results.show()
    results.toPandas().to_csv(
        'kmeansresultsCanadaAndProductsAndDisastersAndClaritin.csv')
    model.stages[-1].save("KMeansModel")
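Because pipeline.save("KMeansPipeline") runs before fit, it stores only the unfitted stage definitions; the trained clustering model is saved separately from model.stages[-1]. A minimal sketch of loading both back, assuming the same working directory:

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeansModel

kmeans_pipeline = Pipeline.load("KMeansPipeline")   # tokenizer/remover/TF/IDF/k-means definitions
kmeans_model = KMeansModel.load("KMeansModel")      # fitted cluster centers
print(len(kmeans_model.clusterCenters()))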
Example #11
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

path = 'tmp/spark-logistic-regression-model'
model.save(path)

pipeline.save("tmp/unfit-lr-model")

sameModel = PipelineModel.load(path)



# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
prediction.select("id", "text", "probability", "prediction").show()
Example #12
    for column in response_cols:
        train_df = train_df.withColumn(column, encode_response(column))

        models.append(
            RandomForestClassifier(labelCol=column,
                                   featuresCol="scaledFeatures",
                                   numTrees=15)
            .setPredictionCol(column + "_pred")
            .setRawPredictionCol(column + "_pred_raw")
            .setProbabilityCol(column + "_proba"))

    # create a list of all transformers
    stages = list()
    stages.extend(quantile_discretizers_numeric)
    stages.extend(string_indexer_categorical)
    stages.extend(id_feature_hashers)
    stages.extend(tweet_countVectorizers)
    stages.extend(doc2vecs)
    stages.append(feature_assambler)
    stages.append(scaler)
    stages.extend(models)
    # Create Pipeline
    pipeline = Pipeline(stages=stages)

    # Fit Pipeline and transform df
    pipeline = pipeline.fit(train_df)

    #pipeline.save("pipeline")
    pipeline.save("hdfs:///user/e1553958/RecSys/pipeline")
Example #13
    models = []
    for column in response_cols:
        train_df = train_df.withColumn(column, encode_response(column))

        models.append(LogisticRegression(labelCol=column,
                                         featuresCol="scaledFeatures",
                                         maxIter=1000,
                                         regParam=0.001,
                                         predictionCol=column + '_pred',
                                         probabilityCol=column + '_proba',
                                         rawPredictionCol=column + '_pred_raw'))

    
    # create a list of all transformers
    stages = list()
    stages.extend(quantile_discretizers_numeric)
    stages.extend(string_indexer_categorical)
    stages.extend(id_feature_hashers)
    #stages.extend(tweet_countVectorizers)
    stages.append(feature_assambler)
    stages.append(scaler)
    stages.extend(models)
    # Create Pipeline
    pipeline = Pipeline(stages=stages)

    # Fit Pipeline and transform df
    pipeline = pipeline.fit(train_df)

    #pipeline.save("pipeline")
    pipeline.save("hdfs:///user/e1553958/RecSys/datasplit/pipeline_logReg")

    
Example #14
def main():
    logger.info(f"Getting dataset from {path_to_train_dataset}...")
    client_storage = storage.Client()
    storage_bucket = client_storage.get_bucket(bucket)

    data = get_dataset().select(['sentiment', 'text'])
    logger.info(f"Current number of partitions: {data.rdd.getNumPartitions()}")
    data = data.repartition(10)
    logger.info(
        f"After repartition, number of partitions: {data.rdd.getNumPartitions()}"
    )

    logger.info(f"Creating pre processing transformers...")
    pt = PreprocessTransformer(inputCol='text', outputCol='text_clean')

    logger.info(f"Creating feature engineering transformers...")
    # statement = """
    # SELECT
    #     *
    # FROM
    #     __THIS__
    # WHERE
    #    text_clean != ''
    #
    # """
    # flt = SQLTransformer(statement=statement)
    tk = Tokenizer(inputCol='text', outputCol='words')
    ng1 = NGram(n=1, inputCol='words', outputCol='1_gr_words')
    ng2 = NGram(n=2, inputCol='words', outputCol='2_gr_words')
    ng3 = NGram(n=3, inputCol='words', outputCol='3_gr_words')
    statement = """
    SELECT
        *, concat(1_gr_words, 2_gr_words, 3_gr_words) c_words
    FROM
        __THIS__
    """
    cnt = SQLTransformer(statement=statement)
    cv = CountVectorizer(inputCol='c_words',
                         vocabSize=80000,
                         outputCol='features')

    logger.info(f"Split dataset...")
    df_train, df_test = data.randomSplit([0.8, 0.2], seed=100500)
    logger.info(
        f"Size of train dataset: {df_train.count()} and test dataset: {df_test.count()}"
    )

    logger.info(f"Building  and fitting model...")
    lr = LogisticRegression(featuresCol='features',
                            labelCol='sentiment',
                            maxIter=5000)
    pipeline_model = Pipeline(
        stages=[pt, tk, ng1, ng2, ng3, cnt, cv, lr]).fit(df_train)

    logger.info(f"Evaluating model...")
    ev = MulticlassClassificationEvaluator(labelCol='sentiment',
                                           metricName="accuracy",
                                           predictionCol='prediction')
    df_predict = pipeline_model.transform(df_test).cache()
    accuracy = ev.evaluate(df_predict)
    logger.info(f"Model accuracy: {accuracy}")

    logger.info(f"Storing model...")
    storage.Blob(f'models/{model_version}/scores',
                 storage_bucket).upload_from_string(f'"accuracy":{accuracy}')
    pipeline_model.save(f"gs://{bucket}/models/{model_version}/pipeline")
Example #15
    sqlContext = SQLContext(sc)

    regex_tokenizer = RegexTokenizer(inputCol="text",
                                     outputCol="words",
                                     pattern="\\W")

    stop_words = []
    with open('/home/asdf/Documents/stopwords.txt', 'r') as contents:
        stop_words = contents.read().split()

    stop_words_remover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(stop_words)

    count_vectors = CountVectorizer(inputCol="filtered",
                                    outputCol="features",
                                    vocabSize=10000,
                                    minDF=5)

    lr = LogisticRegression(maxIter=100, regParam=0.01)
    nb = NaiveBayes(labelCol="label",
                    featuresCol="features",
                    smoothing=1.0,
                    modelType="multinomial")
    pipe1 = Pipeline(
        stages=[regex_tokenizer, stop_words_remover, count_vectors, lr])
    pipe2 = Pipeline(
        stages=[regex_tokenizer, stop_words_remover, count_vectors, nb])

    pipe1.save("models/lr")
    pipe2.save("models/nb")
Example #16
assembler = VectorAssembler(
    inputCols = sparseVectorCols, 
    outputCol = 'features'
)
pipelineStages += [assembler]

normalizer = Normalizer(
    inputCol = 'features',
    outputCol = 'normFeatures'
)
pipelineStages += [normalizer]


pipeline = Pipeline(stages = pipelineStages)
pipelineModel = pipeline.fit(train_df)
train_df = pipelineModel.transform(train_df)

for col in COLUMNS_OHE:
    train_df = train_df.drop(col, col + '_VEC')

for col in COLUMNS_HIGH_CARD:
    train_df = train_df.drop(col, col + '_INDEX')

train_df = train_df.drop('features')

train_df = spark.createDataFrame(train_df.rdd, schema = train_df.schema)

pipelineModel.save(PIPELINE_DIR + 'pipeline_model_preprocess')
pipeline.save(PIPELINE_DIR + 'pipeline_preprocess')

train_df.write.parquet(WRITE_DIR + 'train_clean.parquet')
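Since both the fitted pipelineModel and the unfitted pipeline were saved above, the same preprocessing can be replayed on a held-out set later. A minimal sketch, assuming PIPELINE_DIR and WRITE_DIR are unchanged and test_df is a hypothetical raw test DataFrame with the same input columns:

from pyspark.ml import PipelineModel

preprocess = PipelineModel.load(PIPELINE_DIR + 'pipeline_model_preprocess')
test_clean = preprocess.transform(test_df)
test_clean.write.parquet(WRITE_DIR + 'test_clean.parquet')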
Example #17
class SPARK_MODEL:

    #init model params
    def __init__(self, dataset, dataName, splitRatio, targetType,
                 targetVariable, split, nbSamples, goodClass, sparkModelsId,
                 sparkLearningMethods, sparkOptions, numClasses, extDataSet):
        self.dataset = dataset
        self.dataName = dataName
        self.splitRatio = splitRatio
        self.targetType = targetType
        self.targetVariable = targetVariable
        self.split = split
        self.nbSamples = nbSamples
        self.goodClass = goodClass
        self.sparkModelsId = sparkModelsId
        self.sparkLearningMethods = sparkLearningMethods
        self.sparkOptions = sparkOptions
        self.numClasses = numClasses
        self.extDataSet = extDataSet

    #rdd methods
    def _set_rdd(self, dataset):
        self._rdd = sc.textFile(dataset, 8)
        header = self._rdd.first()
        self._rdd = self._rdd.filter(lambda line: line != header)

        if self.targetType == 'classification':
            print "class"
            self._rdd = self._rdd.map(classParsePoint)
        else:
            self._rdd = self._rdd.map(regParsePoint)

        print(self._rdd.first())

    def _get_rdd(self):
        return self._rdd

    def _get_rddTest(self):
        return self._rddTest

    def _get_rddTraining(self):
        return self._rddTraining

    def _get_rddModel(self):
        return self._rddModel

    #model building: rdd
    def _set_rddModel(self, _type, _SLA, data):
        if _type == 'regression':
            if _SLA == 'randomForest':
                self._rddModel = RandomForest.trainRegressor(
                    data,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity='variance',
                    maxDepth=int(self.sparkOptions[1]),
                    maxBins=32)
            else:
                self._rddModel = ""
        else:  #classification
            if _SLA == 'randomForest':
                print(self.numClasses)
                self._rddModel = RandomForest.trainClassifier(
                    data,
                    numClasses=self.numClasses,
                    categoricalFeaturesInfo={},
                    numTrees=int(self.sparkOptions[4]),
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2])
            else:
                self._rddModel = ""

    def splitData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self._rdd.randomSplit(
                [1 - self.splitRatio, self.splitRatio])
        else:

            print "ExternalValidation"
            self._rddTraining = self._rdd

            self._rddTest = sc.textFile(self.extDataSet, 8)
            header = self._rddTest.first()
            self._rddTest = self._rddTest.filter(lambda line: line != header)

            if self.targetType == 'classification':
                self._rddTest = self._rddTest.map(classParsePoint)
            else:
                self._rddTest = self._rddTest.map(regParsePoint)

    #rdd/dataFrame method
    def rddToDataFrame(self, rdd):
        return rdd.toDF()

    def dataFrameToRdd(self, dataFrame):
        return dataFrame.rdd

    #dataFrame method
    def _set_dataFrame(self):
        self._dataFrame = sqlContext.read.format('csv').options(
            delimiter=';', header='true', inferschema='true',
            nullValue='').load(self.dataset)
        self._dataFrame = self._dataFrame.withColumn(
            self.targetVariable,
            self.dataFrame[self.targetVariable].cast("double"))

    def _get_dataFrame(self):
        return self._dataFrame

    def _get_dataFrameTest(self):
        return self._dataFrameTest

    def _get_dataFrameTraining(self):
        return self._dataFrameTraining

    def splitDataFrameData(self):
        if self.split != "ExternalValidation":
            (self._rddTest, self._rddTraining) = self.dataFrameToRdd(
                self._get_dataFrame()).randomSplit(
                    [1 - self.splitRatio, self.splitRatio])
        else:
            self.splitData()

        self._dataFrameTest = self._rddTest.toDF()
        self._dataFrameTraining = self._rddTraining.toDF()

    def _get_dataFrameModel(self):
        return self._dataFrameModel

    def _get_pipeline(self):
        return self._pipeline

    def _get_crossval(self):
        return self._crossval

    def _get_paramGrid(self):
        return self._paramGrid

    def _get_regEval(self):
        return self._regEval

    #model building: dataframe
    def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):

        if _type == 'regression':
            if _SLA == 'randomForest':
                rf = RandomForestRegressor()
                rf.setLabelCol(self.targetVariable)\
                  .setPredictionCol("prediction")\
                  .setFeaturesCol("features")\
                  .setSeed(100088121)\
                  .setMaxDepth(int(self.sparkOptions[1]))\
                  .setMaxMemoryInMB(10000)\
                  .setFeatureSubsetStrategy(self.sparkOptions[5])
                self._regEval = RegressionEvaluator(
                    predictionCol="prediction",
                    labelCol=self.targetVariable,
                    metricName="rmse")

        else:  #classification
            if _SLA == 'randomForest':
                rf = RandomForestClassifier(
                    labelCol=self.targetVariable,
                    featuresCol="features",
                    maxDepth=int(self.sparkOptions[1]),
                    featureSubsetStrategy=self.sparkOptions[5],
                    impurity=self.sparkOptions[2],
                    probabilityCol="proba")
                if self.goodClass != '':
                    self._regEval = BinaryClassificationEvaluator(
                        labelCol=self.targetVariable,
                        metricName="areaUnderROC")
                else:
                    self._regEval = MulticlassClassificationEvaluator(
                        labelCol=self.targetVariable,
                        predictionCol="prediction",
                        metricName="accuracy")

        # Create a Pipeline
        self._pipeline = Pipeline()
        # Set the stages of the Pipeline #vecAssembler
        self._pipeline.setStages([vecAssembler, rf])
        # GridSearch
        self._paramGrid = (ParamGridBuilder().addGrid(
            rf.numTrees,
            [int(num) for num in self.sparkOptions[4].split(',')]).build())
        # Add the grid to the CrossValidator
        self._crossval = CrossValidator(estimator=self._pipeline,
                                        estimatorParamMaps=self._paramGrid,
                                        evaluator=self._regEval,
                                        numFolds=self.nbSamples)
        # Now let's find and return the best model
        self._dataFrameModel = self._crossval.fit(data).bestModel

        #to be removed
        #print rf.getNumTrees()
        #modelText = str(self._dataFrameModel.stages[-1])
        #._java_obj.toDebugString()
        #nbTrees = int(re.sub('.*?([0-9]*) trees$',r'\1',modelText))
        #print nbTrees
        # end TBR

        rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_" +
                str(self.sparkModelsId[0]))

    #end function

    #model evaluation
    #classification
    def computeKappa(self, m):

        total = np.sum(m)

        row = m.sum(axis=0)
        col = m.sum(axis=1)

        P0 = m.trace() / total

        PE = sum((row[i] / total) * (col[i] / total) for i in range(m.shape[0]))
        return (P0 - PE) / (1 - PE)

    def computeBA(self, m):
        row = m.sum(axis=0)
        col = m.sum(axis=1)
        return sum(m[i][i] / col[i] for i in range(m.shape[0])) / m.shape[0]

    #rdd model evalution
    def getRddPredictionsLabels(self, model, test_data):
        predictions = model.predict(test_data.map(lambda r: r.features))
        return predictions.zip(test_data.map(lambda r: r.label))

    def printRddMulticlassClassificationMetrics(self, predictions_and_labels):
        metrics = MulticlassMetrics(predictions_and_labels)
        print "KAPPA=" + str(
            self.computeKappa(np.array(metrics.confusionMatrix().toArray())))
        print "BA=" + str(
            self.computeBA(np.array(metrics.confusionMatrix().toArray())))
        CMarray = metrics.confusionMatrix().toArray()
        #CMstring = ','.join(['%.5f' % num for num in CMarray])
        print "CM=" + str(CMarray)

    def printRddBinaryClassificationMetrics(self, predictions_and_labels):
        metrics = BinaryClassificationMetrics(predictions_and_labels)
        print "KAPPA=" + str(
            self.computeKappa(np.array(metrics.confusionMatrix().toArray())))
        print "BA=" + str(
            self.computeBA(np.array(metrics.confusionMatrix().toArray())))
        CMarray = metrics.confusionMatrix().toArray()
        #CMstring = ','.join(['%.5f' % num for num in CMarray])
        print "CM=" + str(CMarray)

    def evaluateRddClassificationModel(self):
        predictions_and_labels = self.getRddPredictionsLabels(
            self._get_rddModel(), self._get_rddTest())
        if self.goodClass != '':  #binary classification
            #self.printRddBinaryClassificationMetrics(predictions_and_labels)
            self.printRddMulticlassClassificationMetrics(
                predictions_and_labels)
        else:
            self.printRddMulticlassClassificationMetrics(
                predictions_and_labels)

    def evaluateRddRegressionModel(self):
        # Get predictions
        valuesAndPreds = self.getRddPredictionsLabels(self._get_rddModel(),
                                                      self._get_rddTest())
        # Instantiate metrics object
        metrics = RegressionMetrics(valuesAndPreds)
        # Squared Error
        print("MSE = %s" % metrics.meanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)
        # R-squared
        print("R-squared = %s" % metrics.r2)
        # Mean absolute error
        print("MAE = %s" % metrics.meanAbsoluteError)
        # Explained variance
        print("Explained variance = %s" % metrics.explainedVariance)

    def evaluateDataFrameRegressionModel(self):
        # Now let's use rfModel to compute an evaluation metric for our test dataset: testSetDF
        predictionsAndLabelsDF = self._dataFrameModel.transform(
            self._dataFrameTest)

        # Run the previously created RMSE evaluator, regEval, on the predictionsAndLabelsDF DataFrame
        rmseRF = self._regEval.evaluate(predictionsAndLabelsDF)

        # Now let's compute the r2 evaluation metric for our test dataset
        r2RF = self._regEval.evaluate(predictionsAndLabelsDF,
                                      {self._regEval.metricName: "r2"})

        print("RMSE = %s" % rmseRF)
        print("R-squared = %s " % r2RF)

    def evaluateDataFrameClassificationModel(self, sc):
        #here we have a problem
        a = 1

    #save models
    def saveRddModel(self, sc):
        #save rdd API model
        remove_folder("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                      self.sparkModelsId[0])
        modelPath = "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + str(
            self.sparkModelsId[0])
        self._rddModel.save(sc, modelPath)

    def saveDataFrameModel(self):
        #final model to save
        #self._dataFrameModel = self._pipeline.fit(self._dataFrame)
        self._dataFrameModel = self._crossval.fit(self._dataFrame).bestModel

        modelText = str(self._dataFrameModel.stages[-1])
        #._java_obj.toDebugString()
        nbTrees = int(re.sub('.*?([0-9]*) trees$', r'\1', modelText))
        print(nbTrees)

        #save data frame API model
        remove_folder("/home/t752887/python/myModelPath/SPARK_RF_Regression_" +
                      self.sparkModelsId[0])
        modelPath = "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + str(
            self.sparkModelsId[0])
        self._dataFrameModel.save(modelPath)
        self._pipeline.save(modelPath + "_Pipeline")

    def buildRDDModel(self, sparkContext):

        print "RDD_MODEL"

        # init RDD from dataset
        self._set_rdd(self.dataset)
        # split into test - training set
        self.splitData()
        # save rddTest and rddTraining into CSV and copy to PLP server!

        #self._rddTest.toDF().write.csv('/home/t752887/data/output/'+self.sparkModelsId[0]+'_'+self.dataName+'_test.csv')

        #self._rddTraining.toDF().write.csv('/home/t752887/data/output/'+self.sparkModelsId[0]+'_'+self.dataName+'_training.csv')
        self._rddTraining.toDF().toPandas().to_csv(
            '/home/t752887/data/output/' + self.sparkModelsId[0] + '_' +
            self.dataName + '_training.csv')

        self._rddTest.toDF().toPandas().to_csv('/home/t752887/data/output/' +
                                               self.sparkModelsId[0] + '_' +
                                               self.dataName + '_test.csv')

        #lines = self._rddTest.map(toCSVLine)
        #lines.saveAsTextFile('/home/t752887/data/output/'+self.sparkModelsId[0]+'_'+self.dataName+'_test.csv')

        #lines = self._rddTraining.map(toCSVLine)
        #lines.saveAsTextFile('/home/t752887/data/output/'+self.sparkModelsId[0]+'_'+self.dataName+'_training.csv')

        #could become a loop of models
        if self.targetType == 'classification':
            self._set_rddModel('classification', 'randomForest',
                               self._get_rddTraining())

            self.evaluateRddClassificationModel()

            #final model to save
            self._set_rddModel('classification', 'randomForest',
                               self._get_rdd())

        #regression
        else:
            self._set_rddModel('regression', 'randomForest',
                               self._get_rddTraining())

            self.evaluateRddRegressionModel()

            #final model to save
            self._set_rddModel('regression', 'randomForest', self._get_rdd())

        #TODO: save the model
        self.saveRddModel(sparkContext)

    def buildDataFrameModel(self):
        # init dataframe from dataset
        self._set_dataFrame()
        # split into test - training set
        self.splitDataFrameData()

        #vector assembler
        ignore = [self.targetVariable]
        vecAssembler = VectorAssembler(
            inputCols=[x for x in self._dataFrameTraining.columns if x not in ignore],
            outputCol="features")

        #dataFrame cross-validation Pipeline with model selection
        if self.targetType == 'regression':
            #build model on the data we pass
            self._set_dataFrameModel('regression', 'randomForest',
                                     self._get_dataFrameTraining(),
                                     vecAssembler)
            #evaluate best model
            self.evaluateDataFrameRegressionModel()
            # save the model
            self.saveDataFrameModel()

        else:
            #build model on the data we pass
            self._set_dataFrameModel('classification', 'randomForest',
                                     self._get_dataFrameTraining(),
                                     vecAssembler)
            #TODO evaluate best model
            self.evaluateDataFrameClassificationModel(sc)
            #TODO save the model
            self.saveDataFrameModel()

    def performModelSelection(self):
        try:
            i = float(self.sparkOptions[4])
            return 0
        except (ValueError, TypeError):
            return 1

    dataFrame = property(_get_dataFrame, _set_dataFrame)
    dataFrameTest = property(_get_dataFrameTest)
    dataFrameTraining = property(_get_dataFrameTraining)
    dataFrameModel = property(_get_dataFrameModel, _set_dataFrameModel)
    pipeline = property(_get_pipeline)
    crossval = property(_get_crossval)
    paramGrid = property(_get_paramGrid)
    regEval = property(_get_regEval)

    rdd = property(_get_rdd, _set_rdd)
    rddTest = property(_get_rddTest)
    rddTraining = property(_get_rddTraining)
    rddModel = property(_get_rddModel, _set_rddModel)
Example #18
class BinaryRelevance():
    def __init__(self, featuresCol):
        self.models = []
        self.featuresCol = featuresCol
        self.data = None
        self.label_columns = None
        self.pipeline = None

    def fit(self, train, columns):
        self.label_columns = columns
        for i in columns:
            print(i)
            lr = LogisticRegression(featuresCol=self.featuresCol,
                                    labelCol=i,
                                    predictionCol=i + '_pred',
                                    rawPredictionCol=i + '_rawPrediction',
                                    probabilityCol=i + 'prob')
            model = lr.fit(train)
            self.models.append(model)
        self.pipeline = Pipeline(stages=self.models)
        self.pipeline.fit(train)
        #print(self.models)
    def transform(self, data):
        '''
        for model in self.models:
            data=model.transform(data)
        '''

        from pyspark.ml import Pipeline
        if self.pipeline is None:
            pipe = Pipeline(stages=self.models)
            self.pipeline = pipe
        else:
            pipe = self.pipeline
        data_predicted = pipe.fit(data).transform(data)
        self.data = data_predicted
        return data_predicted

    def save(self, path):
        self.pipeline.save(path)

    def load(self, path):
        self.pipeline = Pipeline.load(path)

    def find_recommendation(self, user, id_column):
        test = self.data
        predicted = [i for i in test.columns if i.endswith('pred')]
        label_columns = self.label_columns
        dict_val = test.where(test[id_column] == user).select(
            *label_columns).toPandas().to_dict()
        dict_pred = test.where(test[id_column] == user).select(
            *predicted).toPandas().to_dict()
        product_dict = {'current': [], 'recommendation': []}
        for i in zip(dict_val.items(), dict_pred.items()):
            #print(i)
            if int(i[1][1][0]) == 1 and int(i[0][1][0]) == 0:
                product_dict['recommendation'].append(i[0][0])
            if int(i[0][1][0]) == 1:
                product_dict['current'].append(i[0][0])

        return product_dict
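A hedged usage sketch for the BinaryRelevance wrapper above; the DataFrame train_df, the binary label columns, and the 'features' vector column are assumptions for illustration.

br = BinaryRelevance(featuresCol='features')
br.fit(train_df, ['product_a', 'product_b'])          # one LogisticRegression per label column
predicted = br.transform(train_df)                    # adds <label>_pred / <label>prob columns
br.save('binary_relevance_pipeline')
print(br.find_recommendation(user=42, id_column='user_id'))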