Example #1
    def clustering_tuning(self):
        df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}",
                             header=None)

        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()

        df = spark.createDataFrame(df_raw)
        assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
        # df_sample = df.sample(withReplacement=False, fraction=0.1)
        df_vec = assembler.transform(df).select("features")

        K_lst = list(range(50, 401, 10))

        # gmm
        for k in K_lst:
            gm = GaussianMixture(k=k, tol=1, seed=10)
            gm.setMaxIter(500)

            model = gm.fit(df_vec)
            model.setPredictionCol("newPrediction")
            transformed = model.transform(df_vec).select("features",
                                                         "newPrediction")

        # Spark DataFrames have no pandas-style reset_index(); return the
        # assignments from the final k directly.
        return transformed
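The sweep above discards every fit. A minimal sketch, reusing df_vec and K_lst from the method, that records each model's summary log-likelihood so a value of k can actually be chosen:

ll_by_k = {}
for k in K_lst:
    model = GaussianMixture(k=k, tol=1, seed=10, maxIter=500).fit(df_vec)
    # logLikelihood generally increases with k; look for an elbow in the
    # curve rather than taking the raw maximum.
    ll_by_k[k] = model.summary.logLikelihood
print(sorted(ll_by_k.items()))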
Example #2
    def run(self):

        tf_path = self.settings.tf_path
        algorithm = self.settings.algorithm
        seed = int(self.settings.seed)
        k = int(self.settings.k)
        result_path = self.settings.result_path
        target = self.settings.target

        spark = SparkSession.builder.getOrCreate()

        with open("train_spark.txt", "w") as file:
            file.write("spark context" + str(spark.sparkContext))
            file.write("===SeessionID===")
            file.write(str(id(spark)))

        df = spark.read.option("header", "true") \
            .option("inferSchema", "true") \
            .parquet(tf_path)
        df.repartition(10)

        # MODELING
        if algorithm == 'GMM':
            gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(
                seed)
            print("=====" * 8)
            print(gmm.explainParams())
            print("=====" * 8)
            model = gmm.fit(df)
        elif algorithm == 'KMeans':
            kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
            print("=====" * 8)
            print(kmm.explainParams())
            print("=====" * 8)
            model = kmm.fit(df)
        else:
            raise ValueError("no alg")

        prediction = model.transform(df)

        with open("./feature_info.pickle", "rb") as handle:
            features_info = pickle.load(handle)

        prediction.select(features_info["numeric_features"] +
                          features_info["category_features"] +
                          [target, 'prediction']).coalesce(1).write.mode(
                              'overwrite').csv(result_path, header=True)
        print("Result file is successfully generated at: ", result_path)
Example #3
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \
        ("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    gmm = GaussianMixture(k=8, featuresCol='features')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show(
    )  # Note "display" is for Databricks; use show() for OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv(
        'gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
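GaussianMixture also produces a 'probability' column with soft cluster memberships; a short sketch for inspecting assignment confidence on the results above:

# Soft assignments: one membership probability per mixture component.
results.select("prediction", "probability").show(5, truncate=False)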
Example #4
def main():
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')

    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm',
                        default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth',
                        type=str,
                        default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')

    args = parser.parse_args()

    spark_session = SparkSession.builder.appName('clustering').getOrCreate()

    data = preprocess(spark_session, args.data_file)

    if args.algorithm == 'kmeans':
        alg = KMeans()
    elif args.algorithm == 'hier':
        alg = BisectingKMeans()
    elif args.algorithm == 'gmm':
        alg = GaussianMixture()

    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
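preprocess, train, and evaluate are defined elsewhere in this project; a hypothetical train consistent with the call site, since all three algorithms expose setK and setSeed:

def train(alg, data, num_clusters, seed=None):
    # Hypothetical helper matching the call above; KMeans, BisectingKMeans
    # and GaussianMixture all support setK/setSeed.
    alg = alg.setK(num_clusters)
    if seed is not None:
        alg = alg.setSeed(seed)
    return alg.fit(data)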
Example #5
def train(df, hiperparameter):
    '''
    Gaussian Mixture training, returning a GaussianMixture model.
    input: - DataFrame
           - config (hyperparameter configuration dict)

    return: GaussianMixture model
    '''
    gm = GaussianMixture(featuresCol=hiperparameter['featuresCol'],
                         predictionCol=hiperparameter['predictionCol'],
                         k=hiperparameter['k'],
                         probabilityCol=hiperparameter['probabilityCol'],
                         tol=hiperparameter['tol'],
                         maxIter=hiperparameter['maxIter'],
                         seed=hiperparameter['seed'])
    model = gm.fit(df)
    return model
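A usage sketch, assuming a DataFrame df with an assembled vector column; the dict keys mirror the ones the function reads:

hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'probabilityCol': 'probability',
    'k': 3,
    'tol': 1e-4,
    'maxIter': 100,
    'seed': 42,
}
model = train(df, hiperparameter)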
Example #6
 def test_gaussian_mixture_summary(self):
     data = [(Vectors.dense(1.0), ), (Vectors.dense(5.0), ),
             (Vectors.dense(10.0), ), (Vectors.sparse(1, [], []), )]
     df = self.spark.createDataFrame(data, ["features"])
     gmm = GaussianMixture(k=2)
     model = gmm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertTrue(isinstance(s.probability, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 3)
Example #8
def main(args):
    spark = SparkSession \
        .builder \
        .master(args[2]) \
        .appName(args[1]) \
        .getOrCreate()
    
    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])

    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(trainingData)
    
    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv,start_computing_time)

    spark.stop()
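appendTime is imported from this project's utilities; a hypothetical stand-in matching the call site (sys.argv plus a start timestamp, with time already imported above):

def appendTime(argv, start):
    # Hypothetical helper: log the app name and elapsed seconds.
    with open("times.log", "a") as f:
        f.write("%s\t%.3f\n" % (argv[1] if len(argv) > 1 else "app",
                                time.time() - start))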
Example #9
    def fit_ml_model(self, k, sample_fraction=None, retry=True):
        if sample_fraction:
            training_data = self.ml_training_data.sample(
                False, fraction=sample_fraction)
        else:
            training_data = self.ml_training_data

        result = GaussianMixture(
            k=k, maxIter=self.max_iterations).fit(training_data)
        ll = result.summary.logLikelihood

        # Retry to get a valid model if the calculated log likelihood is > 0.
        retries = 0
        while retry and ll > 0 and retries < self.fit_model_retries:
            retry_sample_fraction = sample_fraction or self.ll_sample_fraction
            retry_data = self.ml_training_data.sample(
                False, fraction=retry_sample_fraction)
            result = GaussianMixture(
                k=k, maxIter=self.max_iterations).fit(retry_data)
            ll = result.summary.logLikelihood
            retries += 1

        return result
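A usage sketch (instance and attribute values assumed): the helper retries with fresh samples whenever the log-likelihood comes out positive, which this project treats as a degenerate fit:

model = clusterer.fit_ml_model(k=10, sample_fraction=0.2)
print(model.summary.logLikelihood, model.summary.clusterSizes)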
Example #10
def gaussian_mixture():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    data = [(Vectors.dense([-0.1, -0.05]), ), (Vectors.dense([-0.01, -0.1]), ),
            (Vectors.dense([0.9, 0.8]), ), (Vectors.dense([0.75, 0.935]), ),
            (Vectors.dense([-0.83, -0.68]), ), (Vectors.dense([-0.91,
                                                               -0.76]), )]
    df = spark.createDataFrame(data, ["features"])
    gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)
    model.hasSummary
    # True
    summary = model.summary
    summary.k
    # 3
    summary.clusterSizes
    # [2, 2, 2]
    weights = model.weights
    len(weights)
    # 3
    model.gaussiansDF.show()
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[4].prediction == rows[5].prediction
    # True
    rows[2].prediction == rows[3].prediction
    # True
    temp_path = "./"
    gmm_path = temp_path + "/gmm"
    gm.save(gmm_path)
    gm2 = GaussianMixture.load(gmm_path)
    gm2.getK()
    # 3
    model_path = temp_path + "/gmm_model"
    model.save(model_path)
    model2 = GaussianMixtureModel.load(model_path)
    model2.hasSummary
    # False
    model2.weights == model.weights
    # True
    model2.gaussiansDF.show()
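One caveat: save() fails if "./gmm" already exists from a previous run. A sketch that writes the same artifacts to a fresh temporary directory instead:

import tempfile

# Avoid collisions with artifacts left by earlier runs.
temp_path = tempfile.mkdtemp()
gm.save(temp_path + "/gmm")
model.save(temp_path + "/gmm_model")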
# In[36]:

dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
bwssse = []
for k in kValues:
    bkmeans = BisectingKMeans().setK(k).setSeed(122)
    bmodel = bkmeans.fit(dataset)
    bwssse.append(bmodel.computeCost(dataset))
for i in bwssse:
    print(i)

# In[31]:

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(predictionCol="prediction").setK(2).setSeed(538009335)
gmmmodel = gmm.fit(outputFeatureDf)
print("Gaussians shown as a DataFrame: ")
gmmmodel.gaussiansDF.show()

# In[32]:

from sklearn.metrics.cluster import completeness_score
transformed = gmmmodel.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
preds = transformed.select('prediction').collect()
preds_array = [int(i.prediction) for i in preds]
# completeness_score expects (labels_true, labels_pred) in that order
completeness_score(label_array, preds_array)

# In[51]:
# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)


# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------
Example #13
File: Gaussian.py Project: AAB94/RP
# Reconstruction of the truncated StandardScaler call that this snippet
# starts in the middle of; the inputCol/outputCol names are assumed.
stand_scaled = StandardScaler(inputCol="raw_features",
                              outputCol="features",
                              withStd=True,
                              withMean=True)

scaled_model = stand_scaled.fit(train_df)

train_df = scaled_model.transform(train_df)

# Apply the scaler fitted on the training data to both test sets;
# refitting on test data would leak test statistics into the scaling.
test1_df = scaled_model.transform(test1_df)
test2_df = scaled_model.transform(test2_df)

gm = GaussianMixture(featuresCol="features", k=2, seed=2, maxIter=20)

gmodel = gm.fit(train_df)

if gmodel.hasSummary:
    print("Cluster sizes", gmodel.summary.clusterSizes)
    print("Clsuters ", gmodel.summary.k)

test1_df = gmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)

test2_df = gmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

count1 = test1_df.filter("prediction != Occupancy").count()
total1 = test1_df.count()
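The snippet stops after counting mismatches. Assuming Occupancy uses the same 0/1 coding, the natural next step is an error rate; GMM cluster ids are arbitrary, so the flipped labeling is checked too:

err = count1 / total1
print("test1 error rate:", min(err, 1 - err))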
Example #14
transformed = lda_model.transform(dataset)
transformed.display()  # display() is Databricks-specific; use show() on OSS Spark

# COMMAND ----------

# MAGIC %md ##### Topic Modeling using Latent Dirichlet Allocation

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

train_df = spark.read.table("retail_features").selectExpr(
    "selected_features as features")

gmm = GaussianMixture(k=3, featuresCol='features')
gmm_model = gmm.fit(train_df)

gmm_model.gaussiansDF.display()

# COMMAND ----------

# MAGIC %md #### 2. Association Rules
# MAGIC #####Collaborative Filtering - Alternating Least Squares

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
Example #15
# Integrate features
features_processed = VectorAssembler(inputCols=["category", "numeric"],
                                     outputCol="features")
tot_pipeline = Pipeline(stages=[features_processed])
processed = tot_pipeline.fit(df).transform(df)
processed.write.mode("overwrite").parquet(tf_path)

feature_info = {
    "numeric_features": numeric_features,
    "category_features": category_features
}

# MODELING
if algorithm == 'GMM':
    gmm = GaussianMixture().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(gmm.explainParams())
    print("=====" * 8)
    model = gmm.fit(processed)
elif algorithm == 'KMeans':
    kmm = KMeans().setK(k).setFeaturesCol("features").setSeed(seed)
    print("=====" * 8)
    print(kmm.explainParams())
    print("=====" * 8)
    model = kmm.fit(processed)
else:
    raise ValueError("no alg")

prediction = model.transform(processed)
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points per cluster
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
print(gmm.explainParams())
model = gmm.fit(sales)

# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
Example #17
q = "SELECT ss.ss_customer_id AS cid, count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS id1,count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS id3,count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS id5,count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS id7, count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS id9,count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15,count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS id2,count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS id4,count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS id6,count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12, count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS id8,count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,count(CASE WHEN i.i_class_id=16 THEN 1 ELSE NULL END) AS id16 FROM store_sales ss INNER JOIN items i ON ss.ss_item_id = i.i_item_id WHERE i.i_category_name IN ('cat#01','cat#02','cat#03','cat#04','cat#05','cat#06','cat#07','cat#08','cat#09','cat#10','cat#11','cat#12','cat#013','cat#14','cat#15') AND ss.ss_customer_id IS NOT NULL GROUP BY ss.ss_customer_id HAVING count(ss.ss_item_id) > 3"

df = spark.sql(q)

assembler = VectorAssembler(inputCols=[
    "cid", "id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9",
    "id10", "id11", "id12", "id13", "id14", "id15", "id16"
],
                            outputCol="FEATURE")

vd = assembler.transform(df)

cost = list()

gmm = GaussianMixture().setK(2).setFeaturesCol('FEATURE').setSeed(
    538009335).setTol(0.01)
model = gmm.fit(vd)

weights = model.weights
print(weights)
summary = model.summary
summary.k
logLikelihood = summary.logLikelihood

param = model.explainParams()

print(param)

model.gaussiansDF.select("mean").head()
model.gaussiansDF.select("cov").head()
model.gaussiansDF.show()
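cost is initialized above but never filled; a sketch of one way it might be populated, comparing a few values of k by summary log-likelihood:

for k in range(2, 9):
    m = GaussianMixture().setK(k).setFeaturesCol('FEATURE') \
        .setSeed(538009335).setTol(0.01).fit(vd)
    cost.append((k, m.summary.logLikelihood))
print(cost)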
Example #18
# In[21]:


print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show()

# Shows the result
transformed = model.transform(featurized_data)
transformed.show()


# In[22]:


from pyspark.ml.clustering import GaussianMixture



gmm = GaussianMixture().setK(3).setSeed(538009335)
model = gmm.fit(featurized_data)

print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Example #19
# Let's test again with the best k = 7 on the denormalized dataset

kmeans = KMeans(featuresCol="features").setK(7).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
model = pipeline.fit(df_denormalized)
predictions = model.transform(df_denormalized)

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try GaussianMixture instead of KMeans
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture(featuresCol="features").setK(14).setSeed(1)
pipeline = Pipeline(stages=[vectorAssembler, gmm])
model = pipeline.fit(df)
predictions = model.transform(df)
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Let's try finding the best K using an iterative method
Ks = 15
mean_acc = np.zeros((Ks - 1))
ConfusionMx = []
for n in range(7, Ks):

    #Train Model and Predict
Example #20
def SparkML(train_df,
            test_df=None,
            featuresCol='features',
            labelCol='label',
            binaryclass=False,
            multiclass=False,
            n_cluster=2,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=10,
            userid=3,
            itemid=3,
            itemsCol='items',
            minSupport=0.3,
            minConfidence=0.8,
            stringIndexer=False,
            inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False,
            inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False,
            inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False,
            inputColsVectorIndexer=None,
            outputColsVectorIndexer=None,
            maxCategories=None,
            classification=False,
            logisticregression=False,
            decisiontreeclassifier=False,
            linearsvc=False,
            naivebayes=False,
            randomforestclassifier=False,
            gbtclassifier=False,
            regression=False,
            linearregression=True,
            decisiontreeregressor=False,
            randomforestregressor=False,
            gbtregressor=False,
            clustering=False,
            kmeans=False,
            gaussianmixture=False,
            lda=False,
            recommendation=False,
            als=False,
            association=False,
            fpgrowth=False):
    if classification:
        if logisticregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRClassifier = LogisticRegression(featuresCol=featuresCol,
                                              labelCol=labelCol,
                                              predictionCol='Prediction',
                                              probabilityCol='Probability',
                                              rawPredictionCol='RawPrediction',
                                              standardization=True,
                                              maxIter=100,
                                              regParam=0.0,
                                              elasticNetParam=0.0,
                                              tol=1e-06,
                                              fitIntercept=True,
                                              threshold=0.5)
            paramGrid = ParamGridBuilder().addGrid(
                LRClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTClassifier.impurity, ["gini", "entropy"]).addGrid(
                    DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        DTClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            SVClassifier = LinearSVC(featuresCol=featuresCol,
                                     labelCol=labelCol,
                                     predictionCol='Prediction',
                                     rawPredictionCol='RawPrediction',
                                     maxIter=100,
                                     regParam=0.0,
                                     tol=1e-06,
                                     fitIntercept=True,
                                     standardization=True,
                                     threshold=0.0)
            paramGrid = ParamGridBuilder().addGrid(
                SVClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    SVClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            NBClassifier = NaiveBayes(featuresCol=featuresCol,
                                      labelCol=labelCol,
                                      predictionCol='Prediction',
                                      probabilityCol='Probability',
                                      rawPredictionCol='RawPrediction',
                                      smoothing=1.0,
                                      modelType='multinomial',
                                      thresholds=None)
            paramGrid = ParamGridBuilder().addGrid(
                NBClassifier.smoothing,
                [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                numTrees=20,
                featureSubsetStrategy='auto',
                seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                RFClassifier.impurity, ["gini", "entropy"]).addGrid(
                    RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        RFClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).addGrid(
                            RFClassifier.numTrees,
                            [5, 10, 20, 50, 100, 200]).addGrid(
                                RFClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBClassifier = GBTClassifier(featuresCol=featuresCol,
                                         labelCol=labelCol,
                                         predictionCol='Prediction',
                                         maxDepth=5,
                                         maxBins=32,
                                         minInstancesPerNode=1,
                                         minInfoGain=0.0,
                                         lossType='logistic',
                                         maxIter=20,
                                         stepSize=0.1,
                                         seed=None,
                                         subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBClassifier.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBClassifier.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol,
                predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRegressor = LinearRegression(featuresCol=featuresCol,
                                          labelCol=labelCol,
                                          predictionCol='Prediction',
                                          standardization=True,
                                          fitIntercept=True,
                                          loss='squaredError',
                                          maxIter=100,
                                          regParam=0.0,
                                          elasticNetParam=0.0,
                                          tol=1e-06,
                                          epsilon=1.35)
            paramGrid = ParamGridBuilder().addGrid(
                LRegressor.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRegressor.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                seed=None,
                                                varianceCol=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFRegressor = RandomForestRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                subsamplingRate=1.0,
                                                seed=None,
                                                numTrees=20)
            paramGrid = ParamGridBuilder().addGrid(
                RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        RFRegressor.numTrees,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            RFRegressor.subsamplingRate,
                            [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBRegressor = GBTRegressor(featuresCol=featuresCol,
                                       labelCol=labelCol,
                                       predictionCol='Prediction',
                                       maxDepth=5,
                                       maxBins=32,
                                       minInstancesPerNode=1,
                                       minInfoGain=0.0,
                                       subsamplingRate=1.0,
                                       lossType='squared',
                                       maxIter=20,
                                       stepSize=0.1,
                                       seed=None,
                                       impurity='variance')
            paramGrid = ParamGridBuilder().addGrid(
                GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBRegressor.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBRegressor.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBRegressor.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction',
                              k=n_cluster,
                              initMode='k-means||',
                              initSteps=2,
                              tol=0.0001,
                              maxIter=20,
                              seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid(
                    KCluster.maxIter,
                    [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                        KCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster,
                                        tol=0.01,
                                        maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                GMCluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    GMCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LDACluster = LDA(featuresCol=featuresCol,
                             maxIter=20,
                             seed=None,
                             k=n_cluster,
                             learningOffset=1024.0,
                             learningDecay=0.51,
                             subsamplingRate=0.05)
            paramGrid = ParamGridBuilder().addGrid(
                LDACluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    LDACluster.seed, [i for i in range(1001)]).addGrid(
                        LDACluster.subsamplingRate,
                        [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol,
                       itemCol=itemCol,
                       ratingCol=ratingCol,
                       rank=rank,
                       maxIter=10,
                       regParam=0.1,
                       numUserBlocks=10,
                       numItemBlocks=10,
                       alpha=1.0,
                       seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers/recommendForAllItems take the *number* of
            # recommendations to return (numItems/numUsers), not id columns.
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(numItems=userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(numUsers=itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport,
                           minConfidence=minConfidence,
                           itemsCol=itemsCol,
                           predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
times = []
# Average the runtime over four runs.
for i in range(4):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = np.mean(times)

##############       GMM      #################
from pyspark.ml.clustering import GaussianMixture
times = []
# Average the runtime over four runs.
for i in range(4):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
gmmTime = np.mean(times)
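# Side-by-side of the two averaged runtimes (added for illustration):
print("BisectingKMeans avg: %.2fs, GMM avg: %.2fs" %
      (bisectingKmeansTime, gmmTime))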

# Prepare the data for non-PySpark implementations.
clusterData = tsneDataFrame.select("screen_name", "features").collect()
screenames = [x[0] for x in clusterData]
clData = [x[1] for x in clusterData]
clusData = np.array(clData)
x = [cl[0] for cl in clData]
y = [cl[1] for cl in clData]
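
# Quick visual check of the embedding, assuming matplotlib is available
# (not part of the original snippet):
import matplotlib.pyplot as plt
plt.scatter(x, y, s=2)
plt.title("t-SNE embedding")
plt.show()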

###################   DBSCAN   ###################
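# The DBSCAN code itself is truncated in this snippet. A minimal non-Spark
# sketch on the `clusData` array prepared above, assuming scikit-learn is
# available; eps and min_samples are illustrative values, not tuned:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscanLabels = dbscan.fit_predict(clusData)  # label -1 marks noise points
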
# The home latitude and longitude are already on the same scale, so we can proceed.

# Select home latitude and longitude as the features:
selected = ["home_lat", "home_lon"]

# Assemble the feature vector:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=selected, outputCol="features")
assembled = assembler.transform(students)


# ## Fit a Gaussian mixture model

# Specify a Gaussian mixture model with two clusters:
from pyspark.ml.clustering import GaussianMixture
gm = GaussianMixture(featuresCol="features", k=2, seed=12345)

# Examine the hyperparameters:
print(gm.explainParams())

# Fit the Gaussian mixture model:
gmm = gm.fit(assembled)
type(gmm)


# ## Examine the Gaussian mixture model

# Examine the mixing weights:
gmm.weights

# Examine the (multivariate) Gaussian distributions:
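# (The snippet truncates here; a likely continuation, since the Spark ML
# model exposes the fitted distributions as a DataFrame:)
gmm.gaussiansDF.show(truncate=False)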
Example #23
# negative_udf = udf(lambda x: tp_values(x, 1))
# #
#
# #
# train_df = train_df.withColumn('pos', positive_udf(col('ST')).astype(IntegerType())) \
#     .withColumn('neg', negative_udf(col('ST')).astype(IntegerType()))
#
# train_df.show()
#
# assembler = VectorAssembler(inputCols=['pos', 'neg'], outputCol='features')
# train_df = assembler.transform(train_df)
# train_df.show()

# Modelling

kmeans = KMeans().setK(2).setSeed(1).setMaxIter(20)
model = kmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)
#

bkmeans = BisectingKMeans().setK(2).setSeed(1).setMaxIter(20)
model = bkmeans.fit(train_df)
model.transform(test_df).show()
for c in model.clusterCenters():
    print(c)

gaussianmixture = GaussianMixture().setK(2).setSeed(1)
model = gaussianmixture.fit(train_df)
model.transform(test_df).show()
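# A GaussianMixtureModel has no clusterCenters(); the closest analogue is
# the per-component weights and Gaussian parameters (a short sketch):
print(model.weights)
model.gaussiansDF.show(truncate=False)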
Example #24
k = args.k_clusters

if algorithm not in ['kmeans', 'gmm', 'lda', 'spectral']:
    raise ValueError('Not a valid algorithm')

ss = SparkSession.builder.getOrCreate()

df = ss.read.csv(path, header=True, inferSchema=True)

df_preprocessed = preprocessing(df, num_pca=num_pca_features)

df_preprocessed.write.parquet("preprocessed", mode="overwrite")

if algorithm == 'kmeans':
    model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'spectral':
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss, repartition_num=num_nodes)
elif algorithm == 'lda':
    model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'gmm':
    model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)


# Use a distinct loop variable so pyspark.sql.functions.col is not shadowed.
predictions.select([c for c in predictions.columns if c != 'features'])\
           .toPandas()\
           .to_csv(sys.stdout)
Example #25
obj = client.get_object(Bucket='yelpdatacf', Key='book_cf.csv')
df = pd.read_csv(obj['Body'])

# Mean-center the ratings.
df.rating = df.rating - df.rating.mean()
ratings = spark.createDataFrame(df)

# Use the hyperparameters that gave the minimum RMSE.
num_iter, param = 200, 0.2
als = ALS(maxIter=num_iter, regParam=param, userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(ratings)
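
# A hedged sketch of how such an RMSE comparison might look; this is an
# illustration, not part of the original snippet.
from pyspark.ml.evaluation import RegressionEvaluator

train, test = ratings.randomSplit([0.8, 0.2], seed=42)
eval_model = als.fit(train)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
print("held-out RMSE:", evaluator.evaluate(eval_model.transform(test)))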

user_feature = model.userFactors
book_feature = model.itemFactors

k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(user_feature)
transformed = model.transform(user_feature).select('id', 'prediction')
# Note: collecting to the driver and rebuilding a DataFrame is redundant;
# `transformed` could be written out directly.
rows = transformed.collect()
df = spark.createDataFrame(rows)


df.write.jdbc(url='jdbc:%s' % url+'yelp',
        table='book_gm_user_feature20', mode='overwrite',  properties=properties)


k = 20
gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
model = gmm.fit(book_feature)
transformed = model.transform(book_feature).select('id', 'prediction')
rows = transformed.collect()
Example #26
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("GaussianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2).setSeed(538009335)
    model = gmm.fit(dataset)

    print("Gaussians shown as a DataFrame: ")
    model.gaussiansDF.show(truncate=False)
    # $example off$

    spark.stop()
Example #27
with Timer('read', 'Reading data'):
    df = df_base = spark.read.csv('data/yellow_tripdata_2016-01.csv',
                                  header=True,
                                  inferSchema=True)

with Timer('process', 'Cleaning invalid data'):
    df = process(df)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark import StorageLevel

from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.feature import VectorAssembler  # used in the loop below

gmm = GaussianMixture(k=1000)

result = []
with Timer('clustering', 'Computing clusters'):
    for weekday in range(7):
        for hour in range(24):
            with Timer('clustering',
                       'Computing clusters for {}x{}'.format(weekday, hour)):
                df_h = df.filter(df.weekday == weekday).filter(df.hour == hour)
                va = VectorAssembler(
                    inputCols=["pickup_latitude", "pickup_longitude"],
                    outputCol="features")
                df_t = va.transform(df_h)

                model = gmm.fit(df_t)
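                # Hypothetical continuation: the otherwise unused `result`
                # list above suggests the per-slot Gaussians were collected:
                result.append((weekday, hour, model.gaussiansDF.collect()))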
Example #28
# In[84]:

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=["X", "Y", "Z"],
                                  outputCol="features")

# Please instantiate a clustering algorithm from the SparkML package and assign it to the clust variable. Here we don't need to take care of the "CLASS" column since we are in unsupervised learning mode, so let's pretend to not even have the "CLASS" column for now; it will become very handy later in assessing the clustering performance. PLEASE NOTE: IN REAL-WORLD SCENARIOS THERE IS NO CLASS COLUMN, THEREFORE YOU CAN'T ASSESS CLASSIFICATION PERFORMANCE USING THIS COLUMN.
#
#

# In[85]:

from pyspark.ml.clustering import GaussianMixture

clust = GaussianMixture().setK(2).setSeed(1)

# Let’s train...
#

# In[86]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vectorAssembler, clust])
model = pipeline.fit(df)

# ...and evaluate...

# In[87]:
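
# The evaluation cell is truncated here. A plausible continuation (an
# assumption, not the notebook's actual code) scores the clustering with a
# silhouette ClusteringEvaluator:
from pyspark.ml.evaluation import ClusteringEvaluator

predictions = model.transform(df)
print("silhouette:", ClusteringEvaluator().evaluate(predictions))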
Example #29
from pyspark.ml.clustering import GaussianMixture

# (`size=` was renamed to `height=` in seaborn >= 0.9)
g = sns.lmplot(x='X Coordinate', y='Y Coordinate', hue='Primary Type', data=crime_df,
               fit_reg=False, height=10, palette={'NARCOTICS': 'tomato', 'THEFT': 'skyblue'})

# for each type of crime
for crime_type, colour in [('NARCOTICS', 'r'), ('THEFT', 'b')]:
    crime_subset = (
        crime_with_features_sdf
        .filter(sf.col('Primary Type') == crime_type)
    )

    # fit a GMM
    gmm = GaussianMixture(k=30)
    model = gmm.fit(crime_subset)

    # extract the centers of the gaussians
    centers = (
        model
        .gaussiansDF
        .toPandas()
    )

    # Put the transformed data in a variable below
    crimes_with_predictions = model.transform(crime_subset)

    # 2.
    # Write code here
    ranked_gaussians = (
        crimes_with_predictions
        .withColumn('probability', get_probability_udf('probability'))
Example #31
# $example on$
from pyspark.ml.clustering import GaussianMixture
# $example off$
from pyspark.sql import SparkSession

"""
A simple example demonstrating Gaussian Mixture Model (GMM).
Run with:
  bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonGuassianMixtureExample")\
        .getOrCreate()

    # $example on$
    # loads data
    dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")

    gmm = GaussianMixture().setK(2)
    model = gmm.fit(dataset)

    print("Gaussians: ")
    model.gaussiansDF.show()
    # $example off$

    spark.stop()
Example #32
async def gaussian_mixture(self, df):
    return GaussianMixture().fit(df.select('features'))
Example #33
    # Load the data stored in LIBSVM format as a DataFrame.
    data = sparkConf \
        .read \
        .format("libsvm") \
        .load("data\data_libsvm.txt")
    data.show()

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    numTraining = trainingData.count()
    numTest = testData.count()
    print("numTraining =", numTraining, " numTest =", numTest)

    # Train a Gaussian mixture model.
    gmm = GaussianMixture(k=200, tol=0.0001, maxIter=10, seed=1)

    model = gmm.fit(trainingData)

    if model.hasSummary:
        summary = model.summary
        print("k =", summary.k)
        print("cluster sizes =", summary.clusterSizes)
        print("logLikelihood =", summary.logLikelihood)
        print("len weights =", len(model.weights))


    # Make predictions.
    predictions = model.transform(testData)
    predictions.show(5, truncate=False)