def __pipeline(self, modeling_code: str, classifiers_metadata: dict,
                   database_url_training: str, database_url_test: str) -> None:

        (features_training, features_testing, features_evaluation) = \
            self.__modeling_code_processing(
                modeling_code,
                self.__spark_session,
                database_url_training,
                database_url_test)

        classifier_switcher = {
            "LR": LogisticRegression(),
            "DT": DecisionTreeClassifier(),
            "RF": RandomForestClassifier(),
            "GB": GBTClassifier(),
            "NB": NaiveBayes(),
        }
        classifier_threads = []

        for name, metadata in classifiers_metadata.items():
            classifier = classifier_switcher[name]
            classifier_threads.append(
                self.__thread_pool.submit(
                    self.__classifier_processing,
                    classifier,
                    features_training,
                    features_testing,
                    features_evaluation,
                    metadata,
                ))

        for classifier in classifier_threads:
            testing_prediction, metadata_document = classifier.result()
            self.__save_classifier_result(testing_prediction,
                                          metadata_document)
Example #2
def test_sklearn_decision_tree_multiclass():
    import shap
    from sklearn.tree import DecisionTreeClassifier
    import numpy as np

    X, y = shap.datasets.iris()
    y[y == 2] = 1
    model = DecisionTreeClassifier(max_depth=None,
                                   min_samples_split=2,
                                   random_state=0)
    model.fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    assert np.abs(shap_values[0][0, 0] - 0.05) < 1e-1
    assert np.abs(shap_values[1][0, 0] + 0.05) < 1e-1
Example #3
def entrenamiento(df):
	# Vectorize the input columns
	df = df.select("Finishing", "ShortPassing", "BallControl", "Stamina", "SlidingTackle", "GKReflexes", "Crossing", "Agility", "Position", "Dribbling", "SprintSpeed")
	assembler = VectorAssembler(
		inputCols=["Finishing", "ShortPassing", "BallControl", "Stamina", "SlidingTackle", "GKReflexes", "Crossing", "Agility", "Dribbling", "SprintSpeed"],
		outputCol="features")
	df = assembler.transform(df)

	# Split our dataset
	(training_df, test_df) = df.randomSplit([0.7, 0.3])

	# Training
	entrenador = DecisionTreeClassifier(
		labelCol="Position",
		featuresCol="features")

	# Build the pipeline
	pipeline = Pipeline(stages=[entrenador])
	# Fit the model
	model = pipeline.fit(training_df)

	# Prediction
	predictions_df = model.transform(test_df)

	# Evaluator --> accuracy
	evaluator = MulticlassClassificationEvaluator(
		labelCol="Position",
		predictionCol="prediction",
		metricName="accuracy")

	# Accuracy
	exactitud = evaluator.evaluate(predictions_df)
	print("Accuracy: {}".format(exactitud))
def get_model(model_string='LogisticRegression'):
    """
    Get the desired model object for training and classification

    Args:
    Returns:
    model object from pyspark.ml.classification
    """
    models_dict = {
        'LogisticRegression':
        LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0),
        'DecisionTreeClassifier':
        DecisionTreeClassifier(),
        'RandomForestClassifier':
        RandomForestClassifier(numTrees=10),
        # Deep Learning note: the number of neurons in the last layer needs to equal
        # the number of categories. The number of neurons in the first layer
        # needs to equal the vocabulary size of the count vectorizer.
        'MultilayerPerceptronClassifier':
        MultilayerPerceptronClassifier(tol=1e-3,
                                       maxIter=10000,
                                       layers=[500, 100, 20, 6],
                                       blockSize=128,
                                       seed=1234),
        'NaiveBayes':
        NaiveBayes()
    }
    return models_dict[model_string]
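A minimal usage sketch for get_model (not part of the original snippet). The DataFrame df, its "label"/"features" columns, and the 80/20 split are assumptions for illustration.

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)  # hypothetical prepared DataFrame
estimator = get_model('DecisionTreeClassifier')           # unfitted pyspark.ml estimator
fitted_model = estimator.fit(train_df)
predictions = fitted_model.transform(test_df)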
Example #5
def get_multi_classification_pipeline():

    transformer = AddHasText()

    stringIndexer = StringIndexer(
        inputCol='subbreddit_display_name',
        outputCol='label'
    )

    assembler = VectorAssembler(
        inputCols=[
            "post_title_embedding",
            "comments_number",
            "nsfw",
            "spoiler",
            "up_votes_number",
            "has_text"
        ],
        outputCol="features"
    )

    dt = DecisionTreeClassifier(
        labelCol='label',
        featuresCol='features'
    )

    return Pipeline(
        stages=[
            transformer,
            stringIndexer,
            assembler,
            dt
        ]
    )
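A hedged usage sketch for the pipeline above: posts_df is a hypothetical DataFrame that already carries the referenced columns (post_title_embedding, comments_number, nsfw, spoiler, up_votes_number, subbreddit_display_name) plus whatever the custom AddHasText transformer needs to derive has_text.

pipeline = get_multi_classification_pipeline()
pipeline_model = pipeline.fit(posts_df)        # fits the StringIndexer and the decision tree
scored = pipeline_model.transform(posts_df)
scored.select("label", "prediction").show(5)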
Example #6
def DecisionTree():
    IrisData = spark.sparkContext.textFile("file:///home/unbroken/MyFiles/Work/Programming/Spark/DecisionTree/Iris.txt")\
    .map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
    IrisData.createOrReplaceTempView("iris")
    df = spark.sql("select * from iris")
    labelIndexer = StringIndexer(inputCol='label',
                                 outputCol='labelIndex').fit(IrisData)
    featureIndexer = VectorIndexer(
        inputCol='feature',
        outputCol='indexFeature').setMaxCategories(4).fit(IrisData)
    labelConverter = IndexToString(inputCol='prediction',
                                   outputCol='predictionLabel').setLabels(
                                       labelIndexer.labels)
    trainingData, testingData = IrisData.randomSplit([0.7, 0.3])
    dtClassifier = DecisionTreeClassifier().setLabelCol(
        'labelIndex').setFeaturesCol('indexFeature')
    pipelineClassifier = Pipeline().setStages(
        [labelIndexer, featureIndexer, dtClassifier, labelConverter])
    modelClassifier = pipelineClassifier.fit(trainingData)
    prediction = modelClassifier.transform(testingData)
    prediction.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol(
        'labelIndex').setPredictionCol('prediction').setMetricName("accuracy")
    accuracy = evaluator.evaluate(prediction)
    print(accuracy)

    treeModelClassifier = modelClassifier.stages[2]
    print("Learned classification tree model:\n" +
          str(treeModelClassifier.toDebugString))
Example #7
def trainBinaryTreeModel(data, directory=""):
    tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
    remover = StopWordsRemover().setInputCol("words").setOutputCol(
        "filtered").setCaseSensitive(False)
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(0)
    dt = DecisionTreeClassifier(labelCol="label",
                                maxDepth=30,
                                featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, dt])

    paramGrid = ParamGridBuilder()\
              .addGrid(dt.maxDepth, [2, 5, 10, 20, 30]) \
              .addGrid(dt.maxBins, [10, 50, 80]) \
              .build()

    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator().setMetricName(
            'areaUnderPR'
        ),  # set area Under precision-recall curve as the evaluation metric
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    cvModel = crossval.fit(data)
    modelName = directory + "BinaryTreeModel"
    cvModel.bestModel.write().overwrite().save(modelName)

    return modelName
Example #8
def trainAndEvalModelByDecisionTreeClassifier(stages, train_df, test_df,
                                              evaluator):
    '''
    Train and evaluate a model by building an ML Pipeline around a DecisionTreeClassifier (decision tree classifier).
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    print(
        '======================= Building an ML Pipeline with DecisionTreeClassifier for model training ======================='
    )
    dt = DecisionTreeClassifier(labelCol='label',
                                featuresCol='features',
                                maxDepth=5,
                                maxBins=20)
    dtPipeline = Pipeline(stages=stages +
                          [dt])  # print(str(dtPipeline.getStages()))
    dtPipelineModel = dtPipeline.fit(train_df)
    bestModel = dtPipelineModel.stages[1]  # print(bestModel.toDebugString)
    print(
        '======================= Using the trained DecisionTreeClassifier ML Pipeline to make predictions ======================='
    )
    predicts = dtPipelineModel.transform(test_df)
    # print(str(predicts.columns))  # columns added after prediction: 'rawPrediction', 'probability', 'prediction'
    # predicts.select('probability', 'prediction').show(10)
    accuracy = evaluator.evaluate(predicts)
    print(
        '======================= Evaluating the trained DecisionTreeClassifier ML Pipeline (accuracy='
        + str(accuracy) + ') =======================')
    return (bestModel, predicts, accuracy)
 def main(self, sc, *args):
     from pyspark.sql.session import SparkSession
     from pyspark.ml import Pipeline
     from pyspark.ml.feature import HashingTF, Tokenizer
     from pyspark.ml.classification import DecisionTreeClassifier
     
     # Initialize the Spark session
     sql = SparkSession.builder\
         .enableHiveSupport() \
         .config("hive.exec.dynamic.partition", "true") \
         .config("hive.exec.dynamic.partition.mode", "nonstrict") \
         .config("hive.exec.max.dynamic.partitions", "4096") \
         .getOrCreate()
     
     # Load the cleaned data
     df = sql.read.format("com.databricks.spark.csv") \
         .option("header", "true") \
         .option("delimiter", ";") \
         .load(self.input().path)
     
     # Train the classifier
     labeled = df.withColumn("label", df.subreddit.like("datascience").cast("double"))
     train_set, test_set = labeled.randomSplit([0.8, 0.2])
     tokenizer = Tokenizer().setInputCol("cleaned_words").setOutputCol("tokenized")
     hashing = HashingTF().setNumFeatures(1000).setInputCol("tokenized").setOutputCol("features")
     decision_tree = DecisionTreeClassifier()
     pipeline = Pipeline(stages=[tokenizer, hashing, decision_tree])
     model = pipeline.fit(train_set)
     model.save(self.output().path)
def evaluateDecisionTree(trainDF, testDF):
    """
    Train decision tree classifiers with several maxDepth
    values and return a benchmark data list for those
    maxDepth params.
    """

    benchmarkData = []
    for maxDepth in [10, 15]:
        classifier = DecisionTreeClassifier(maxDepth=maxDepth)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Decision Tree evaluation with maxDepth: {}".format(maxDepth))
        accuracy = printevaluatation(model, predictions)
        benchmarkData += [("DT", "maxDepth", maxDepth, float(accuracy))]
    return benchmarkData
def DTree_with_maxFeatures_maxDepth_fixed(data, max_depth, max_features):
    gt0 = time()
    crossed['DTree:depth' + repr(max_depth) + ':features' +
            repr(max_features)] = []
    classifier = DecisionTreeClassifier(maxDepth=max_depth,
                                        maxBins=max_features,
                                        impurity='gini',
                                        maxMemoryInMB=1024)
    model = classifier.fit(data['scaled_train_df'])
    predictions = model.transform(data['scaled_cv_df'])
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    metric = evaluator.evaluate(predictions)
    crossed['DTree:depth' + repr(max_depth) + ':features' +
            repr(max_features)].append([metric, time() - gt0])
    return crossed
def decision_tree_classifier(trainingDataFrame,
                             maxCategories=4,
                             maxDepth=5,
                             maxBins=32,
                             minInstancesPerNode=1,
                             minInfoGain=0.0,
                             maxMemoryInMB=256,
                             cacheNodeIds=False,
                             checkpointInterval=10,
                             impurity="gini",
                             seed=None):
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel"). \
                   setHandleInvalid("keep").fit(trainingDataFrame)
    featureIndexer = VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=maxDepth,
                                maxBins=maxBins,
                                minInstancesPerNode=minInstancesPerNode,
                                minInfoGain=minInfoGain,
                                maxMemoryInMB=maxMemoryInMB,
                                cacheNodeIds=cacheNodeIds,
                                checkpointInterval=checkpointInterval,
                                impurity=impurity,
                                seed=seed)
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[2]
    return result
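A short usage sketch, assuming train_df is a DataFrame with "label" and "features" columns; the parameter values are arbitrary examples.

result = decision_tree_classifier(train_df, maxDepth=7, impurity="entropy")
pipeline_model = result["model"]    # PipelineModel: label/feature indexers plus the fitted tree
tree_model = result["summary"]      # the DecisionTreeClassificationModel stage
print(tree_model.toDebugString)
predictions = pipeline_model.transform(train_df)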
Example #13
def entrenamiento(df):
    # Vectorize the input columns
    df = df.select("EDAD", "GENERO", "ETNIA", "GLICEMIA",
                   "PERIMETRO_ABDOMINAL", "RCV_GLOBAL", "IMC", "DIABETES")
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "GLICEMIA", "PERIMETRO_ABDOMINAL",
        "RCV_GLOBAL", "IMC"
    ],
                                outputCol="features")
    df = assembler.transform(df)

    # Split the dataset
    (training_df, test_df, validation_df) = df.randomSplit([0.7, 0.2, 0.1])

    # Training
    entrenador = DecisionTreeClassifier(labelCol="DIABETES",
                                        featuresCol="features")

    # Build the pipeline
    pipeline = Pipeline(stages=[entrenador])
    # Fit the model
    model = pipeline.fit(training_df)

    # Prediction (note: the test-set predictions are immediately replaced by
    # the validation-set predictions, so only the latter are evaluated below)
    predictions_df = model.transform(test_df)
    predictions_df = model.transform(validation_df)

    # Evaluator --> accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="DIABETES",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

    # Accuracy
    exactitud = evaluator.evaluate(predictions_df)
    print("Accuracy: {}".format(exactitud))
Example #14
def training(df):
    # 0. load the cleanning data
    df_cleanning = df.select("id").distinct()
    # Split the data into training and test sets (30% held out for testing)
    (df_training, df_test) = df_cleanning.randomSplit([0.7, 0.3])

    # 1. load the training data
    # prepare the training set
    df_result = df
    df_result = df_result.select("id", "label", "features")
    labelIndexer = StringIndexer(inputCol="label",
                                 outputCol="indexedLabel").fit(df_result)
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=6).fit(df_result)

    df_training.show(10)
    # 1.1 build the training set
    df_training = df_training.join(df_result, how="left", on="id")
    df_training.show()
    print(df_training.count())

    # 1.2 build the test set
    df_test = df_test.join(df_result, how="left", on="id")
    df_test.show()
    print(df_test.count())

    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures")

    # Chain indexers and tree in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(df_training)

    # Make predictions.
    df_predictions = model.transform(df_test)

    # Select example rows to display.
    df_predictions.show(10)
    df_predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(df_predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    # summary only
    print(treeModel)
    model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/zyyin/pfizer_model/0.0.4/model_without_prod"
    )
    print(treeModel.toDebugString)

    return treeModel
Example #15
 def test_decisiontree_classifier(self):
     dt = DecisionTreeClassifier(maxDepth=1)
     path = tempfile.mkdtemp()
     dtc_path = path + "/dtc"
     dt.save(dtc_path)
     dt2 = DecisionTreeClassifier.load(dtc_path)
     self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                      "Loaded DecisionTreeClassifier instance uid (%s) "
                      "did not match Param's uid (%s)"
                      % (dt2.uid, dt2.maxDepth.parent))
     self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth],
                      "Loaded DecisionTreeClassifier instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #16
    def train_model(self, train_df, assembler):
        dt = DecisionTreeClassifier(labelCol="label",
                                    featuresCol="features",
                                    seed=self.RANDOM_SEED)
        pipeline = Pipeline(stages=[assembler, dt])

        model = pipeline.fit(train_df)
        return model
    def trainModel(self, sentimentInfoData):
        label = sentimentInfoData.get(pc.INDEXEDCOLM)
        feature = sentimentInfoData.get(pc.FEATURECOLUMN)
        dataset = sentimentInfoData.get(pc.DATASET)
        '''temp split the dataset to training and testing dataset'''
        (trainDataset, testDataset) = dataset.randomSplit([0.7,0.3])
        decisionTreeClassifier = DecisionTreeClassifier(labelCol= label, featuresCol=feature)
        decisionModel = decisionTreeClassifier.fit(trainDataset)
        # decisionModel.transform(trainDataset).groupBy("sentiment").count().show()
        predictionDataset = decisionModel.transform(testDataset)
        #calculating the accuracy of the model
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictionDataset)
        print("Test Error = %g " % (1.0 - accuracy))

        # gbt = GBTClassifier(labelCol=label, featuresCol=feature).fit(trainDataset)
def main(spark,filename):
  df = spark.read.csv(filename,header=False,inferSchema=True)
# df.show(4)
# +---+---+---+---+-----------+
# |_c0|_c1|_c2|_c3|        _c4|
# +---+---+---+---+-----------+
# |5.1|3.5|1.4|0.2|Iris-setosa|
# |4.9|3.0|1.4|0.2|Iris-setosa|
# |4.7|3.2|1.3|0.2|Iris-setosa|
# |4.6|3.1|1.5|0.2|Iris-setosa|
# +---+---+---+---+-----------+
  vector_assembler = VectorAssembler(inputCols=['_c0','_c1','_c2','_c3'],outputCol='features')
  v_df = vector_assembler.transform(df)

# v_df.show(4)
# +---+---+---+---+-----------+-----------------+
# |_c0|_c1|_c2|_c3|        _c4|         features|
# +---+---+---+---+-----------+-----------------+
# |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
# |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
# |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
# |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
# +---+---+---+---+-----------+-----------------+
# only showing top 4 rows
  indexer = StringIndexer(inputCol='_c4',outputCol='label')
  i_df = indexer.fit(v_df).transform(v_df)
#   i_df.show(4)
# +---+---+---+---+-----------+-----------------+-----+
# |_c0|_c1|_c2|_c3|        _c4|         features|label|
# +---+---+---+---+-----------+-----------------+-----+
# |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
# |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
# |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
# |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
# +---+---+---+---+-----------+-----------------+-----+
# only showing top 4 rows
  splits = i_df.randomSplit([0.6,0.4],1)
  train_df =  splits[0]
  test_df = splits[1]
  dt = DecisionTreeClassifier(labelCol= 'label',featuresCol='features')
  dt_model = dt.fit(train_df)
  dt_pred = dt_model.transform(test_df)
  dt_evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy')
  dt_accuracy =  dt_evaluator.evaluate(dt_pred)
  print(dt_accuracy)
Example #20
def train(data, max_depth, max_bins):
    print("Parameters: max_depth: {}  max_bins: {}".format(
        max_depth, max_bins))
    #     spark = SparkSession.builder.appName("DecisionTreeClassificationExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    #     data = spark.read.format("libsvm").load(os.environ['DSX_PROJECT_DIR']+data_path)

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel").fit(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)
    # Train a DecisionTree model.
    dt = DecisionTreeClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                maxDepth=max_depth,
                                maxBins=max_bins)

    # Chain indexers and tree in a Pipeline.
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, dt])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and compute test error.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    test_error = 1.0 - accuracy
    print("Test Error = {} ".format(test_error))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("test_error", test_error)

    tree_model = model.stages[2]
    print(tree_model)

    mlflow.spark.log_model(model, '')

    spark.stop()
    def pipeline(self, modeling_code, classifiers_metadata):
        spark_session = (
            SparkSession
                .builder
                .appName("modelBuilder")
                .config("spark.driver.port", os.environ[SPARK_DRIVER_PORT])
                .config("spark.driver.host",
                        os.environ[MODEL_BUILDER_HOST_NAME])
                .config("spark.jars.packages",
                        "org.mongodb.spark:mongo-spark-connector_2.11:2.4.2",
                        )
                .config("spark.scheduler.mode", "FAIR")
                .config("spark.scheduler.pool", "modelBuilder")
                .config("spark.scheduler.allocation.file",
                        "./fairscheduler.xml")
                .master("spark://"
                        + os.environ[SPARKMASTER_HOST]
                        + ":"
                        + str(os.environ[SPARKMASTER_PORT])
                        )
                .getOrCreate()
        )

        (features_training, features_testing, features_evaluation) = \
            self.modeling_code_processing(
                modeling_code,
                spark_session)

        classifier_switcher = {
            "LR": LogisticRegression(),
            "DT": DecisionTreeClassifier(),
            "RF": RandomForestClassifier(),
            "GB": GBTClassifier(),
            "NB": NaiveBayes(),
        }
        classifier_threads = []

        for name, metadata in classifiers_metadata.items():
            classifier = classifier_switcher[name]
            classifier_threads.append(
                self.thread_pool.submit(
                    Model.classifier_processing,
                    classifier,
                    features_training,
                    features_testing,
                    features_evaluation,
                    metadata,
                )
            )

        for classifier in classifier_threads:
            testing_prediction, metadata_document = classifier.result()
            self.save_classifier_result(
                testing_prediction,
                metadata_document
            )

        spark_session.stop()
Example #22
def decisionTree(training_data, test_data):
    tree_classifier = DecisionTreeClassifier(
        featuresCol="features",  # data column containing the input feature vector
        labelCol="Casualty_Severity_Index",  # data column containing the target attribute (class indices)
        impurity="entropy",  # use the information-gain criterion when selecting split attributes
        maxDepth=5)  # limit the maximum depth of the generated tree

    tree_model = tree_classifier.fit(training_data)
    predictions = tree_model.transform(test_data)

    test_error = predictions.filter(
        predictions["prediction"] != predictions["Casualty_Severity_Index"]
    ).count() / float(test_data.count())
    print("Testing error: {0:.4f}".format(test_error))
    return predictions
Example #23
def decision_tree_generator(training_data, deal_id):
  ####In:
  #A training data set
  #The deal_id you want to generate a decision tree for

  ####Out
  #The tree is saved
  #An update message is printed

  training_data = training_data.withColumnRenamed(deal_id, 'label')
  # Note: algo="classification" and numClasses=2 belong to the old RDD-based MLlib API and are
  # not valid DecisionTreeClassifier (DataFrame API) parameters, so they are dropped here.
  dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                              maxDepth=8, impurity="entropy")
  model = dt.fit(training_data)
  model.write().overwrite().save(f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/")
  output_message = "Saved a Decision Tree for "+deal_id+"."
  print(output_message)

  return model
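A usage sketch under stated assumptions: training_data already holds a "features" vector column and a binary 0/1 column named after the deal; "deal_123" is a hypothetical identifier.

tree_model = decision_tree_generator(training_data, "deal_123")
print(tree_model.toDebugString)  # inspect the learned splits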
def decisionTreeClassifier(trainingData, testData, ncolumns, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    import numpy as np
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import time

    dt = DecisionTreeClassifier(labelCol="label",
                                featuresCol="features",
                                maxDepth=15,
                                maxBins=15,
                                impurity='entropy')
    timer = ''
    start = time.time()
    cvModelDT = dt.fit(trainingData)
    end = time.time()
    timer = ((end - start) / 60)

    prediction = cvModelDT.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = cvModelDT.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])
    return feat, accuracy, areaUC, timer
    def compute_decision_tree(self):

        dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
        stages = [self.featurizer, dt]
        paramGrid = ParamGridBuilder() \
            .baseOn([self.train_pipeline.stages, stages]) \
            .addGrid(self.featurizer.modelName, self.featurizers) \
            .build()

        return paramGrid
def get_model(classifier, params):
    """
    TODO: Add support for params in pyspark ML
    :param classifier:
    :param params:
    :return:
    """
    if classifier == 'Decision Tree':
        return DecisionTreeClassifier(labelCol="cardio",
                                      featuresCol="features")
    return RandomForestClassifier(labelCol="cardio", featuresCol="features")
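A usage sketch (assumptions: train_df contains the "cardio" label and "features" columns; params is currently ignored, per the TODO).

estimator = get_model('Decision Tree', params={})
model = estimator.fit(train_df)
predictions = model.transform(train_df)
predictions.select("cardio", "prediction").show(5)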
Example #27
	def testWorkflow(self):
		df = self.sqlContext.read.csv(irisCsvFile, header = True, inferSchema = True)
		
		formula = RFormula(formula = "Species ~ .")
		classifier = DecisionTreeClassifier()
		pipeline = Pipeline(stages = [formula, classifier])
		pipelineModel = pipeline.fit(df)
		
		pmmlBytes = toPMMLBytes(self.sc, df, pipelineModel)
		pmmlString = pmmlBytes.decode("UTF-8")
		self.assertTrue(pmmlString.find("<PMML xmlns=\"http://www.dmg.org/PMML-4_3\" version=\"4.3\">") > -1)
    def __init__(self, data):
        tokenizer = Tokenizer(inputCol="text", outputCol="words")

        vectorizer = CountVectorizer(inputCol="words", outputCol="rawFeatures")

        idf = IDF(minDocFreq=3, inputCol="rawFeatures", outputCol="features")

        dt = DecisionTreeClassifier(maxDepth=30, maxBins=128, minInstancesPerNode=5, maxMemoryInMB=4096)

        pipeline = Pipeline(stages=[tokenizer, vectorizer, idf, dt])

        self.model = pipeline.fit(data)
Example #29
def retrain_full_model(data, model_type, paramMap):
    '''
    This function takes the whole dataset and retrains the given model with the best parameters.

    Arguments:
        data {PySpark Dataframe} -- A PySpark Dataframe containing feature vectors and labels
        paramMap {dict} -- A dictionary of the best parameter values
        model_type {str} -- The type of model to train

    Returns:
        model -- The model retrained on the full dataset
    '''

    if model_type == 'logistic':
        lr = LogisticRegression()
        model = lr.fit(data, paramMap)
    elif model_type == 'decisiontree':
        dt = DecisionTreeClassifier()
        model = dt.fit(data, paramMap)

    return model
Example #30
def classify_target():
    """Forecast binary target."""

    df = sql.read.parquet(str(DATA_PARQUET))
    features = ['cost', 'call_duration_minutes', 'data_volume_mb']
    variables = features + ['test_flag', 'target']

    pipeline_prepare = Pipeline(stages=[
        VectorAssembler(inputCols=features, outputCol='features'),
    ])

    prepared = pipeline_prepare.fit(df).transform(df.dropna(subset=variables))
    training = prepared.filter(col('test_flag') == 0)
    testing = prepared.filter(col('test_flag') == 1)
    training_small = training.sample(fraction=0.3, seed=100500)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='target')

    breakpoint()

    # Logistic regression

    classifier = LogisticRegression(regParam=0.3,
                                    elasticNetParam=0,
                                    featuresCol='features',
                                    labelCol='target',
                                    predictionCol='prediction',
                                    probabilityCol='probability')
    model = classifier.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))

    breakpoint()

    # Decision Tree Classifier

    classifier = DecisionTreeClassifier(featuresCol='features',
                                        labelCol='target',
                                        maxDepth=3)
    model = classifier.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))

    breakpoint()

    # Random Forest Classifier
    rf = RandomForestClassifier(featuresCol='features', labelCol='target')
    model = rf.fit(training_small)
    predicted = model.transform(testing)
    print('Test Area Under ROC: ', evaluator.evaluate(predicted))

    breakpoint()
    def __init__(self, classifier_class, num_classes=None, numerical_features_index=None, nominal_features_index=None,
                 fine_nominal_features_index=None, classifier_opts=None, epochs_number=None, level=None, fold=None,
                 classify=None, workers_number=None, arbitrary_discr='', weight_features=True):

        self.spark_session = SingletonSparkSession.get_session()
        self.scale = False
        self.probas_ = None
        self.is_keras = False
        self.workers_number = workers_number
        self.epochs_number = epochs_number

        if classifier_class == 'drf':
            self._classifier = RandomForestClassifier(featuresCol='features', labelCol='categorical_label',
                                                      predictionCol='prediction', probabilityCol='probability',
                                                      rawPredictionCol='rawPrediction',
                                                      maxDepth=20, maxBins=128, minInstancesPerNode=1,
                                                      minInfoGain=0.0, maxMemoryInMB=1024, cacheNodeIds=False,
                                                      checkpointInterval=10,
                                                      impurity='gini', numTrees=100, featureSubsetStrategy='sqrt',
                                                      seed=None, subsamplingRate=1.0)
        elif classifier_class == 'dnb':
            self._classifier = NaiveBayes(featuresCol='scaled_features', labelCol='categorical_label',
                                          predictionCol='prediction', probabilityCol='probability',
                                          rawPredictionCol='rawPrediction', smoothing=1.0,
                                          modelType='multinomial', thresholds=None, weightCol=None)
            self.scale = True
        elif classifier_class == 'dgb':
            self._classifier = GBTClassifier(featuresCol='features', labelCol='categorical_label',
                                             predictionCol='prediction', maxDepth=5, maxBins=32,
                                             minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                                             cacheNodeIds=False, checkpointInterval=10, lossType='logistic',
                                             maxIter=20, stepSize=0.1, seed=None,
                                             subsamplingRate=1.0, featureSubsetStrategy='all')
        elif classifier_class == 'ddt':
            self._classifier = DecisionTreeClassifier(featuresCol='features', labelCol='categorical_label',
                                                      predictionCol='prediction', probabilityCol='probability',
                                                      rawPredictionCol='rawPrediction', maxDepth=5, maxBins=32,
                                                      minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256,
                                                      cacheNodeIds=False, checkpointInterval=10, impurity='gini',
                                                      seed=None)
        elif classifier_class.startswith('dk'):
            depth = classifier_opts[0]
            self.keras_wrapper = SklearnKerasWrapper(*classifier_opts, model_class=classifier_class[1:],
                                                     epochs_number=epochs_number, num_classes=num_classes,
                                                     nominal_features_index=[], fine_nominal_features_index=[],
                                                     numerical_features_index=numerical_features_index + fine_nominal_features_index + nominal_features_index,
                                                     level=level, fold=fold, classify=classify,
                                                     weight_features=weight_features, arbitrary_discr=arbitrary_discr)
            self._classifier = self.keras_wrapper.init_model()[2]
            self.nominal_features_index = nominal_features_index
            self.is_keras = True

        self.model_ = None
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC ####Decision Trees
# MAGIC You can read more about Decision Trees from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-decision-tree.html).
# MAGIC 
# MAGIC Decision Trees are a popular algorithm because they can handle categorical data and work with multiclass data.

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# COMMAND ----------

# MAGIC %md We can extract the number of nodes in our decision tree as well as the tree depth of our model.

# COMMAND ----------

print "numNodes = ", dtModel.numNodes
print "depth = ", dtModel.depth

# COMMAND ----------
def main(): 
	root =  os.path.dirname(os.path.abspath(__file__))

	print("Digits Handwriting Recognition using Spark")
	print("Root file path is = %s" %root)
	conf = SparkConf().setAppName("OCR")
	sc = SparkContext(conf = conf)
	sc.setLogLevel("WARN")

	sqlContext = SQLContext(sc)


	print("loading dataset")
	trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
	testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

	# check if rdd support toDF
	if not hasattr(trainRDD, "toDF"):
        	print("ERROR: RDD does not support toDF")
        	os.exit(1)


	## convert RDDs to data frames
	trainDF = trainRDD.toDF()
	testDF = testRDD.toDF()

	print("INFO: train dataframe count = %u" %trainDF.count())
	print("INFO: test dataframe count = %u" %testDF.count())

	indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
	dtc = DecisionTreeClassifier(labelCol="indexedLabel")

	pipeline = Pipeline(stages=[indexer, dtc])
	model = pipeline.fit(trainDF)


	## train multiple depth models
	variedMaxDepthModels = []
	
	print("Create varied depth CNN models [1..8]")
	for mdepth in xrange(1, 9):
		start = time.time()			

		## maximum depth
		dtc.setMaxDepth(mdepth)
		
		## create pipeline
		pipeline = Pipeline(stages = [indexer, dtc])
		
		## create the model
		model = pipeline.fit(trainDF)
		
		## add to varied container
		variedMaxDepthModels.append(model)

		end = time.time()

		print("trained a CNN depth of %u, duration = [%.3f] secs" %(mdepth, end - start))
	
	print("=================================================")

	## report model accuraries
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName="precision")
	
	## mdepth
	print("Evaluate all models precision")
	for mdepth in range(1, 9):
		model = variedMaxDepthModels[mdepth - 1]
		
		predictions  = model.transform(testDF)
		
		precision = evaluator.evaluate(predictions)
		
		print("CNN depth = %u, precision = %.3f" %(mdepth, precision))

				
		
	print("Finished processing %u digits" %testDF.count())
Example #34
#String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
td.show()

#Splitting data
(trainingData, testData) = td.randomSplit([0.6, 0.4])
trainingData.count()
testData.count()
testData.collect()

#Creating decision tree model
dtClassifier = DecisionTreeClassifier(labelCol="indexed", minInstancesPerNode=1500)
dtModel = dtClassifier.fit(trainingData)
dtModel.numNodes
dtModel.depth

#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","indexed","label","features").show(10)

#Evaluation
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="precision")
evaluator.evaluate(predictions)

#Draw a confusion matrix
Example #35
#section 8.2.6
# OneVsRest is not available in Python.

#section 8.3.1
from pyspark.ml.feature import StringIndexer
dtsi = StringIndexer(inputCol="label", outputCol="label-ind")
dtsm = dtsi.fit(penlpoints)
pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed("label-ind", "label")

pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,    0.,    3.,    0.,    0.,    3.,    1.,    0.,    0.,
#                 0.],
#             [   0.,  213.,    0.,    1.,    2.,    1.,    0.,    2.,    0.,
# I know this step was useful to me once; here it doesn't really seem to be
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()



#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************


from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),labelCol=string_indexer.getOutputCol())
dt_model = dt.fit(dfTrainFinal)



# Apply the same steps to our tiny test set.
# In theory the pipeline lets us automate all of this, but we probably won't use it

# EDIT: it is actually fairly easy to create transformers from each step, so
# pipelines may well be doable. To be seen.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)
# The predictions
df_test_pred = dt_model.transform(df_test_final)