Python DecisionTreeClassificationModelの例、pyspark.ml.classification.DecisionTreeClassificationModel Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_decision_tree_spark_classifier.py プロジェクト: zhangge001/dtreeviz

def tree_model() -> (DecisionTreeClassificationModel):
    """Load the decision-tree fixture that matches the installed Spark version.

    A local Spark session is created (or reused) first; the session object
    itself is discarded, the call is made only for its side effect.

    Returns:
        The model loaded from the Spark 3 fixture when the major version of
        ``pyspark`` is 3 or newer, otherwise from the Spark 2 fixture.

    Raises:
        RuntimeError: If the ``pyspark`` major version is older than 2, for
            which no fixture path is defined.
    """
    SparkSession.builder \
        .master("local[2]") \
        .appName("dtreeviz_sparkml") \
        .getOrCreate()

    spark_major_version = int(pyspark.__version__.split(".")[0])
    if spark_major_version >= 3:
        fixture_path = "fixtures/spark_3_0_decision_tree_classifier.model"
    elif spark_major_version >= 2:
        fixture_path = "fixtures/spark_2_decision_tree_classifier.model"
    else:
        # Previously this case fell through and silently returned None,
        # contradicting the declared return type.
        raise RuntimeError(
            "No decision tree fixture for pyspark version "
            + pyspark.__version__)
    return DecisionTreeClassificationModel.load(fixture_path)

コード例 #2

0

ファイルを表示

ファイル: open.py プロジェクト: shh2000/buaa_bigdata_baseline

def test(spark):
    """Score ``predict.csv`` with the saved model and print its accuracy.

    The text file is parsed line by line with ``parse_line``, turned into
    TF-IDF features (the IDF stage is fitted on this same data), and passed
    to the model loaded from ``'Bayes20000'``.  The ``prediction`` column is
    written as CSV to ``submit`` and the accuracy is printed.
    """
    context = spark.sparkContext

    # Feature stages: sentence -> words -> hashed term counts -> TF-IDF.
    word_splitter = Tokenizer(inputCol="sentence", outputCol="words")
    term_hasher = HashingTF(
        inputCol="words", outputCol="rawFeatures", numFeatures=8000)
    idf_stage = IDF(inputCol="rawFeatures", outputCol="features")

    testing = context.textFile('predict.csv').map(parse_line).toDF()

    model = DecisionTreeClassificationModel.load('Bayes20000')

    hashed = term_hasher.transform(word_splitter.transform(testing))
    rescaled = idf_stage.fit(hashed).transform(hashed)
    rescaled.persist()

    def to_dense_row(record):
        # Rebuild each row with a float label and a dense feature vector.
        return Row(label=float(record['label']),
                   features=Vectors.dense(record['features']))

    dense_frame = rescaled.select("features", "label").rdd \
        .map(to_dense_row).toDF()
    predictions = model.transform(dense_frame)

    csv_options = dict(path='submit', header=True, sep=',', mode='overwrite')
    predictions.select('prediction').write.csv(**csv_options)

    scorer = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = scorer.evaluate(predictions)
    print(f"The accuracy on test-set is {accuracy}")

コード例 #3

0

ファイルを表示

def tree_model() -> (DecisionTreeClassificationModel):
    """Return the decision-tree model stored in the fixtures directory.

    A local Spark session is created (or reused) before loading; its handle
    is not kept.
    """
    builder = SparkSession.builder.master("local[2]")
    builder.appName("dtreeviz_sparkml").getOrCreate()

    fixture_path = "fixtures/spark_decision_tree_classifier.model"
    return DecisionTreeClassificationModel.load(fixture_path)

コード例 #4

0

ファイルを表示

    def read_model(self):
        """Load the classifier named in ``self.best_model_path``.

        Both the featurizer and the classifier type are recognised by
        substring matches on the path.

        Returns:
            A ``(featurizer_name, classifier)`` tuple.

        Raises:
            ValueError: If the path names no known featurizer or no known
                classifier (previously this surfaced as an
                ``UnboundLocalError`` at the ``return``).
        """
        model_path = self.best_model_path

        # Resolve the featurizer first: it is a cheap string check, so an
        # unusable path is rejected before any model is loaded.
        for candidate in ("VGG16", "VGG19", "InceptionV3", "Xception",
                          "ResNet50"):
            if candidate in model_path:
                featurizer_name = candidate
                break
        else:
            raise ValueError(
                "No known featurizer name in model path: %r" % (model_path,))

        if "LogisticRegression" in model_path:
            classifier = LogisticRegressionModel.load(model_path)
        elif "DecisionTree" in model_path:
            classifier = DecisionTreeClassificationModel.load(model_path)
        elif "RandomForest" in model_path:
            classifier = RandomForestClassificationModel.load(model_path)
        elif "LinearSVC" in model_path:
            classifier = LinearSVCModel.load(model_path)
        else:
            raise ValueError(
                "No known classifier name in model path: %r" % (model_path,))

        return featurizer_name, classifier

コード例 #5

0

ファイルを表示

ファイル: loadmodeltest.py プロジェクト: cloud17shield/Model_Pyfiles

def handler(message):
    """Score every record of a collected batch and publish the prediction.

    Each record is indexed as ``(key, value)``.  The key length selects the
    path: longer than 10 characters means ``value`` is an image path scored
    with ``p_lr_test``; exactly 10 means ``value`` is a comma-separated row
    scored with the tree model loaded from HDFS.  Other records are only
    printed.
    """
    for record in message.collect():
        print('record', record, type(record))
        print('-----------')
        print('tuple', record[0], record[1], type(record[0]), type(record[1]))
        # producer.send(output_topic, b'message received')
        key, value = record[0], record[1]
        key_length = len(key)

        if key_length > 10:
            image_DF = dl.readImages(value)
            image_DF.show()
            scored_images = p_lr_test.transform(image_DF)
            # The first prediction is shifted down by one before publishing.
            predict_value = scored_images.select('prediction').head()[0] - 1
            print('predict', predict_value)
            print('byte predict', str(predict_value).encode('utf-8'))
            print('byte key', str(key).encode('utf-8'))
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=str(predict_value).encode('utf-8'))
            producer.flush()
            print('predict over')
        elif key_length == 10:
            print('entered csv model part')
            modelloaded = DecisionTreeClassificationModel.load(
                "hdfs:///treemodelofcsv")
            column_names = ('Type', 'Age', 'Breed1', 'Breed2', 'Gender',
                            'Color1', 'Color2', 'Color3', 'MaturitySize',
                            'FurLength', 'Vaccinated', 'Dewormed',
                            'Sterilized', 'Health', 'Quantity', 'Fee',
                            'VideoAmt', 'PhotoAmt')
            NewInput = Row(*column_names)
            value_lst = str(value).split(',')
            print('value_lst', value_lst)
            print('lst_len', len(value_lst))
            # The first 17 fields are integers; the 18th is passed through
            # as the raw string.
            leading_ints = [int(field) for field in value_lst[:17]]
            new_input = NewInput(*leading_ints, value_lst[17])
            df_new_input = sql_sc.createDataFrame([new_input])
            df_new_input.show()
            df_new_input = pipeline.fit(df_new_input).transform(df_new_input)
            df_new_input = feature.transform(df_new_input)
            new_predict = modelloaded.transform(df_new_input)
            new_predict.show()
            predict_value = str(new_predict.select('prediction').head()[0])
            encoded_prediction = predict_value.encode('utf-8')
            print('predict value', encoded_prediction)
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=encoded_prediction)
            producer.flush()

コード例 #6

0

ファイルを表示

ファイル: Clasificacion.py プロジェクト: diegoabdul/Burnout

def DecisionTree(data):
    """Classify ``data`` with the saved decision tree.

    Only the first row of the result is used.

    Returns:
        A tuple ``(label, percentage)``: ``label`` is ``'FALSO'`` when the
        first prediction equals 1.0 and ``'VERDADERO'`` otherwise;
        ``percentage`` is the first component of that row's probability
        vector multiplied by 100.
    """
    model = DecisionTreeClassificationModel.load(
        'modelo_DecisionTree/modelDecisionTree')
    scored = model.transform(data).select('prediction', 'probability')
    # Flattening yields [prediction, probability, prediction, ...].
    flattened = scored.rdd.flatMap(lambda row: row).collect()
    first_prediction = flattened[0]
    print(first_prediction)

    label = 'FALSO' if first_prediction == 1.0 else 'VERDADERO'
    return label, flattened[1][0] * 100

コード例 #7

0

ファイルを表示

def decision_tree_classifier():
    """Fit a tiny decision tree, print it, and round-trip it through disk.

    The estimator is saved to ``./dtc`` and the fitted model to
    ``./dtc_model``; both are loaded back immediately.  Nothing is returned.
    """
    session = (SparkSession.builder
               .appName("Python Spark SQL basic example")
               .config("spark.some.config.option", "some-value")
               .getOrCreate())

    # Two labelled rows with a single feature each.
    samples = [(1.0, Vectors.dense(1.0)),
               (0.0, Vectors.sparse(1, [], []))]
    frame = session.createDataFrame(samples, ["label", "features"])
    label_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    indexed = label_indexer.fit(frame).transform(frame)

    estimator = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
    fitted = estimator.fit(indexed)
    print(fitted.toDebugString)

    # Score one dense sample; the first row is fetched but not inspected.
    dense_sample = session.createDataFrame([(Vectors.dense(-1.0), )],
                                           ["features"])
    fitted.transform(dense_sample).head()
    # A sparse sample frame is built as well but never scored.
    session.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                            ["features"])

    # Persist and reload the estimator, then the fitted model.
    base_dir = "."
    estimator_dir = base_dir + "/dtc"
    estimator.save(estimator_dir)
    DecisionTreeClassifier.load(estimator_dir)

    fitted_dir = base_dir + "/dtc_model"
    fitted.save(fitted_dir)
    DecisionTreeClassificationModel.load(fitted_dir)

コード例 #8

0

ファイルを表示

def detect(log_entries_df,
           model_path='/shared/models/sentiment/overall/optimized',
           additional_cols=True,
           incl_idf=False,
           reduced_feature_set=True):
    """Classify log entries as ``anomaly`` or ``normal``.

    The entries are preprocessed, vectorized, and scored with the decision
    tree stored at ``model_path``.  A prediction of 1.0 is reported as
    ``"anomaly"``; anything else as ``"normal"``.

    Returns:
        With ``additional_cols`` truthy, a DataFrame with the columns
        ``prediction``, ``classifier`` (always ``'sentiment'``) and the
        configured key and value columns.  Otherwise a DataFrame with one
        ``value`` column holding a JSON-like string of prediction and text.
    """
    classifier = DecisionTreeClassificationModel.load(model_path)

    tokens_df = preprocess(log_entries_df, additional_cols,
                           reduced_feature_set)
    vectors_df = vectorize(tokens_df, incl_idf)
    scored_df = classifier.transform(vectors_df)

    # Human-readable verdict derived from the numeric prediction.
    verdict = when(col("prediction") == 1.0,
                   lit("anomaly")).otherwise(lit("normal"))

    if not additional_cols:
        json_value = concat(lit('{"prediction":"'), verdict,
                            lit('","text":"'), col('text'), lit('"}'))
        return scored_df.select("text", "prediction") \
            .select(json_value.alias("value"))

    tagged_df = scored_df \
        .select("text", "prediction", cfg.key_col_name, cfg.value_col_name) \
        .withColumn('classifier', lit('sentiment'))
    return tagged_df.select(verdict.alias("prediction"), 'classifier',
                            cfg.key_col_name, cfg.value_col_name)

コード例 #9

0

ファイルを表示

    def prediction(self, infoData):
        """Score the sentiment dataset with a stored model and join the result.

        The parquet dataset named in ``infoData`` is read, given an internal
        id, transformed by ``self.dataTransformation``, optionally expanded
        with n-grams, and scored with the model stored at the configured
        location.  After ``self.invertIndex`` the prediction column is joined
        back onto the original dataset on ``pc.DMXINDEX``.

        Args:
            infoData: Mapping with the settings read below via ``pc``
                constants; it is updated in place with the working dataset.

        Returns:
            The original dataset joined with the prediction column.

        Raises:
            ValueError: If the algorithm name is neither
                ``"GradientBoostClassifier"`` nor ``"DecisionTreeClassifier"``.
        """
        ngramFlag = infoData.get(pc.ISNGRAM)
        isNgram = False if ngramFlag is None else ngramFlag
        predictionColm = infoData.get(pc.PREDICTIONCOLM)
        algoName = infoData.get(pc.ALGORITHMNAME)
        modelStorageLocation = infoData.get(pc.MODELSTORAGELOCATION)
        spark = infoData.get(pc.SPARK)
        datasetPath = infoData.get(pc.SENTIMENTDATASETPATH)
        originalDataset = spark.read.parquet(datasetPath)
        originalDataset = pu.addInternalId(originalDataset)
        infoData.update({pc.DATASET: originalDataset})

        infoData = self.dataTransformation(infoData)

        dataset = infoData.get(pc.DATASET)
        if isNgram:
            # sahil-- handle the none value for ngram parameter at the time
            # of data creation
            textProcessing = TextProcessing()
            ngramPara = infoData.get(pc.NGRAMPARA)
            dataset = textProcessing.ngrams(dataset, pc.DMXLEMMATIZED,
                                            ngramPara)

        # sahil- hard-coding the algorithm name for comparison; handle this
        # while finalising.
        # Plain ``==`` is used on purpose: ``"...".__eq__(x)`` returns the
        # truthy ``NotImplemented`` for a non-string ``x`` (e.g. None), which
        # made the old checks load a model for a missing algorithm name.
        if algoName == "GradientBoostClassifier":
            predictionModel = GBTClassificationModel.load(modelStorageLocation)
        elif algoName == "DecisionTreeClassifier":
            predictionModel = DecisionTreeClassificationModel.load(
                modelStorageLocation)
        else:
            raise ValueError(
                "Unsupported algorithm name: {!r}".format(algoName))

        dataset = dataset.drop(predictionColm)
        originalDataset = originalDataset.drop(predictionColm)
        dataset = predictionModel.transform(dataset)
        # Calling the indexToString step after the prediction.
        infoData.update({pc.DATASET: dataset})
        infoData = self.invertIndex(infoData)

        dataset = infoData.get(pc.DATASET)
        dataset = dataset.select(pc.DMXINDEX, predictionColm)
        finalDataset = pu.joinDataset(originalDataset, dataset, pc.DMXINDEX)
        return finalDataset

コード例 #10

0

ファイルを表示

def decision_tree_evaluator(test_data,deal_id):
    """Score ``test_data`` with the saved tree for ``deal_id`` and report it.

    The model is loaded from the deal's S3 folder, so it must have been
    saved there beforehand.  The ``deal_id`` column is renamed to ``label``
    before scoring, and the area under the precision-recall curve is
    printed.

    Args:
        test_data: The testing data set.
        deal_id: The deal whose tree is evaluated.

    Returns:
        The evaluator object (not the metric value).
    """
    model_dir = f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/"
    tree = DecisionTreeClassificationModel.load(model_dir)
    labelled = test_data.withColumnRenamed(deal_id, 'label')
    scored = tree.transform(labelled)

    # The hard ``prediction`` column is used as the raw prediction input.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="prediction",
        metricName="areaUnderPR")

    area_under_pr = evaluator.evaluate(scored)
    print(f"Decision Tree area under PR {deal_id} = {area_under_pr}")

    return evaluator

コード例 #11

0

ファイルを表示

ファイル: 1z. Bernoulli_model_evaluators.py プロジェクト: LotteVanUtrecht/masterthesis-databricks

def get_metrics(deal_id, test_data=market_test):
    """Evaluate the saved logistic-regression and tree models for one deal.

    Both models are loaded from the deal's folders under ``/mnt/lotte`` and
    applied to ``test_data`` with the ``deal_id`` column renamed to
    ``label``.

    Args:
        deal_id: The deal whose models are evaluated.
        test_data: The testing data set.

    Returns:
        ``[lr_accuracy, lr_area, trees_accuracy, trees_area]`` where the
        ``area`` entries are the area under the precision-recall curve.
    """
    logistic = LogisticRegressionModel.load(
        f"/mnt/lotte/logistic_regression/{deal_id}/")
    tree = DecisionTreeClassificationModel.load(
        f"/mnt/lotte/decision_trees/{deal_id}/")

    # Both models score the same relabelled frame.
    labelled = test_data.withColumnRenamed(deal_id, 'label')
    scored_frames = (logistic.transform(labelled), tree.transform(labelled))

    evaluators = (
        MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction",
            metricName="accuracy"),
        BinaryClassificationEvaluator(
            labelCol="label", rawPredictionCol="prediction",
            metricName="areaUnderPR"),
    )

    # Outer loop over models, inner loop over metrics, which gives the
    # documented order of the returned list.
    return [evaluator.evaluate(scored)
            for scored in scored_frames
            for evaluator in evaluators]

コード例 #12

0

ファイルを表示

def apply_decision_tree_classifier(tweets):
    """Return ``tweets`` transformed by the decision tree stored on HDFS."""
    location = ('hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/'
                'Sentiment_Analysis_-_Decision_tree.0000')
    classifier = DecisionTreeClassificationModel.load(location)
    scored = classifier.transform(tweets)
    return scored

コード例 #13

0

ファイルを表示

# ``rf_model1``, ``dt1`` and ``training`` are defined outside this excerpt.
training2_1 = rf_model1.transform(training)

PredictionsandLabels = training2_1.select('prediction','Survived').rdd

PredictionsandLabels.collect()

# 2 learning process - created a model

model22 = dt1.fit(training)
model22.depth
model22.numFeatures

model22.save('E:/kaggle/model22')

# ``load`` is a classmethod, so it is called on the class directly instead
# of on a throw-away empty model instance.
model122 = DecisionTreeClassificationModel.load('E:/kaggle/model22')

training4 = model122.transform(training)

training4.show(3)
# TODO(review): a dangling ``model23 =`` with no right-hand side stood here
# and made the whole script a SyntaxError; its intended value is unknown.

training2 = model22.transform(training)

PredictionsandLabels = training2.select('prediction','Survived').rdd

PredictionsandLabels.collect()
# --------------------------------------------------------------

#Resubstitution approach

コード例 #14

0

ファイルを表示

                                     [5, 4.91, 5.11],
                                     [0, 5.95, 8.17],
                                     [1, 6.41, 8.34],
                                     [4, 5.73, 5.93],
                                     [2, 6.39, 7.45],
                                     [3, 7.29, 6.5]]),
                           columns=['Hour', 'true','simulation'])\
.astype({'Hour': 'int32'})\
.sort_values(by="Hour")

# COMMAND ----------

# Plot the "true" and "simulation" columns against row position with fixed
# axis limits.
ax = hour_prices[["true", "simulation"]].plot(
    use_index=False, xlim=(0, 23), ylim=(0, 10))
ax.legend()

# COMMAND ----------

# Summary statistics of the predicted and the true winning bids.
(market_predictions
 .select("lr_max_bid", "trees_max_bid", "true_winning_max_bid")
 .describe()
 .show())

# COMMAND ----------

# Print the structure of the tree saved for one deal.
deal_id = "e86f7061c"
model_trees = DecisionTreeClassificationModel.load(
    f"/mnt/lotte/decision_trees/{deal_id}/"
)
print(model_trees.toDebugString)
#deal_id = "48f06d9af"

コード例 #15

0

ファイルを表示

ファイル: pysparkTry.py プロジェクト: kyokagong/python3learning

def load():
    """Create a local Spark session and load the model saved under ``'tmp'``.

    Neither the session nor the loaded model is kept; the function returns
    ``None``.
    """
    createLocalSparkSession()
    DecisionTreeClassificationModel.load('tmp')

コード例 #16

0

ファイルを表示

def _get_root_node(tree: DecisionTreeClassificationModel):
    """Return the result of the Java-side ``rootNode`` call on ``tree``."""
    root_node = tree._call_java('rootNode')
    return root_node

コード例 #17

0

ファイルを表示

ファイル: kafka_twitter_consumer.py プロジェクト: B2BDA/Pyspark-ML-Templates-Book

# (1) Import our Config file and our Pre-Processing and Feature Vectorisation pipeline functions
import config
import model_pipelines

__author__ = "Jillur Quddus"
__credits__ = ["Jillur Quddus"]
__version__ = "1.0.0"
# Fixed: this was spelled ``_maintainer__`` (one leading underscore), so the
# conventional ``__maintainer__`` attribute was never defined.
__maintainer__ = "Jillur Quddus"
__email__ = "*****@*****.**"
__status__ = "Development"

# (2) Create a Spark Session using the Spark Context instantiated from spark-submit
spark = SparkSession.builder.appName("Stream Processing - Real-Time Sentiment Analysis").getOrCreate()

# (3) Load the Trained Decision Tree Classifier that we trained and persisted in Chapter 06
decision_tree_model = DecisionTreeClassificationModel.load(config.trained_classification_model_path)

# (4) Spark Structured Streaming does not yet support the automatic inference of JSON Kafka values into a Dataframe without a Schema
# Therefore let us define the Schema explicitly. Every field is declared as a string.
schema = StructType([
    StructField("created_at", StringType()),
    StructField("id", StringType()),
    StructField("id_str", StringType()),
    StructField("text", StringType()),
    StructField("retweet_count", StringType()),
    StructField("favorite_count", StringType()),
    StructField("favorited", StringType()),
    StructField("retweeted", StringType()),
    StructField("lang", StringType()),
    StructField("location", StringType()),
])

コード例 #18

0

ファイルを表示

# COMMAND ----------

# Generate and evaluate a tree for each of the first five deal ids.
#maxDepth=10, maxBins=128,maxMemoryInMB=2048,seed=1
for i in range(5):
    decision_tree_generator(market_train, deal_ids[i])
    decision_tree_evaluator(market_test, deal_ids[i])

#worse results than the defaults, probably because of overfitting

# COMMAND ----------

# For every deal: load its saved tree, score the test features and show how
# many rows fall on each distinct probability vector.
for deal_id in deal_ids:
    market_predictions = market_test.select("features", "market_guid")
    model = DecisionTreeClassificationModel.load(
        f"s3://rtl-databricks-datascience/lpater/decision_trees/{deal_id}/")
    market_predictions = market_predictions.withColumnRenamed(deal_id, 'label')
    market_predictions = model.transform(market_predictions)
    market_predictions.groupBy("probability").count().show(100, False)

# COMMAND ----------

print(deal_ids)

# COMMAND ----------

# Print what logistic_regression_evaluator returns for every deal id.
for deal_id in deal_ids_list:
    print(logistic_regression_evaluator(test_data=market_test,
                                        deal_id=deal_id))

コード例 #19

0

ファイルを表示

# Index the label column; the pipeline stages are already-fitted indexers.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index").fit(df) for column in ["AdoptionSpeed"]]
pipeline = Pipeline(stages=indexers)
df = pipeline.fit(df).transform(df)
df_test = pipeline.fit(df_test).transform(df_test)

# Assemble the input columns into a single feature vector.
feature = VectorAssembler(inputCols=input_cols, outputCol="features")
feature_vector = feature.transform(df)

feature_vector_test = feature.transform(df_test)
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
testData.printSchema()
#testData.show(10)

# Train the tree, persist it, and evaluate the reloaded copy.
lr = DecisionTreeClassifier(labelCol="AdoptionSpeed_index", featuresCol="features")
lrModel = lr.fit(trainingData)
lrModel.write().overwrite().save("hdfs:///treemodelofcsv")
modelloaded = DecisionTreeClassificationModel.load("hdfs:///treemodelofcsv")
lr_prediction = modelloaded.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="AdoptionSpeed_index", predictionCol="prediction",
                                              metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of DecisionTreeModel is = %g" % (lr_accuracy))
print("Test Error of DecisionTreeModel = %g " % (1.0 - lr_accuracy))

# Score the separate test frame.  Ids and predictions are collected in ONE
# action: two separate collect() calls run the scoring job twice and pair
# the two lists only by position.
lr_prediction = modelloaded.transform(feature_vector_test)
scored_rows = lr_prediction.select('PetID', 'prediction').collect()
predictions = [int(row['prediction']) for row in scored_rows]
predictions_ids = [row['PetID'] for row in scored_rows]
df_new = pd.DataFrame()
df_new['PetID'] = predictions_ids
df_new['AdoptionSpeed'] = predictions

コード例 #20

0

ファイルを表示

ファイル: flask start_old.py プロジェクト: sh994m/Machine-Learning

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

import numpy as np
from flask import Flask, abort, jsonify, request

from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.linalg import Vectors


# ``load`` is a classmethod, so it is called on the class directly instead
# of on a throw-away empty model instance.
model5 = DecisionTreeClassificationModel.load('E:/kaggle/titanic/dt_model10')

# Bare attribute reads; their values are not used.
model5.depth
model5.numFeatures

app = Flask(__name__)
@app.route('/api',methods=('GET','POST'))
def make_predict():
    """Score the JSON body of the request with ``model5``.

    The payload is wrapped in a dense vector and placed in a one-row
    DataFrame.  The JSON response is ``'Survived'`` when the prediction
    equals 1 and ``'Not Survived'`` otherwise.
    """
    print('hi, good morning ... ')
    payload = request.get_json(force=True)
    print(payload)

    single_row = [(1, Vectors.dense(payload))]
    frame = spark.createDataFrame(single_row, ['index', 'Features'])
    frame.show()

    scored = model5.transform(frame)
    output = scored.select('prediction').first()[0]
    print(output)

    if output == 1:
        verdict = 'Survived'
    else:
        verdict = 'Not Survived'
    return jsonify(verdict)

if __name__ == '__main__':

コード例 #21

0

ファイルを表示

ファイル: detector.py プロジェクト: gabrielmldantas/detector_fraude

 def load_model(self):
     """Return the saved model, or train a new one when none is stored.

     When ``self.model_path`` does not exist, the result of
     ``self.train_model`` on freshly loaded data is returned instead.
     """
     if not self.model_path.exists():
         training_data = self.load_data()
         return self.train_model(training_data)
     stored_location = str(self.model_path)
     return DecisionTreeClassificationModel.load(stored_location)

コード例 #22

0

ファイルを表示

# ``rf_model1``, ``dt1`` and ``training`` are defined outside this excerpt.
training2_1 = rf_model1.transform(training)

training2_1.select('prediction','Survived').show()

PredictionsandLabels = training2_1.select('prediction','Survived').rdd

PredictionsandLabels.collect()

model22 = dt1.fit(training)
model22.depth
model22.numFeatures

model22.save('/users/jyothsnap/Kaggle/titanic/model22')

# ``load`` is a classmethod, so it is called on the class directly instead
# of on a throw-away empty model instance.
model122 = DecisionTreeClassificationModel.load(
    '/users/jyothsnap/Kaggle/titanic/model22')

training4 = model122.transform(training)

training4.show(3)
# TODO(review): a dangling ``model23 =`` with no right-hand side stood here
# and made the whole script a SyntaxError; its intended value is unknown.

training2 = model22.transform(training)

PredictionsandLabels = training2.select('prediction','Survived').rdd

PredictionsandLabels.collect()
# --------------------------------------------------------------

#Resubstitution approach

コード例 #23

0

ファイルを表示

ファイル: model deploy and run service.py プロジェクト: sh994m/Machine-Learning

# Please run this program from anaconda prompt (command line)
# python "Program path and name"
# python "e:\studyml-lab\Machine-Learning\Deployment\model deploy and run service.py"

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# import numpy as np
from flask import Flask, jsonify, request

from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.linalg import Vectors

# ``load`` is a classmethod, so it is called on the class directly instead
# of on a throw-away empty model instance.
model141 = DecisionTreeClassificationModel.load('E:/kaggle/model22')
# Bare attribute reads; their values are not used.
model141.depth
model141.numFeatures

app = Flask(__name__)


@app.route('/api', methods=('GET', 'POST'))
def make_predict():
    """Score the JSON body of the request with ``model141``.

    NOTE(review): no ``return`` statement is visible in this block, so as
    shown the view returns ``None``.  The listing looks cut off after the
    prediction is computed -- confirm against the full file.
    """
    print('hi, good morning ... ')

    data = request.get_json(force=True)
    print(data)
    # One-row DataFrame: a constant index and the payload as a dense vector.
    predict_df = spark.createDataFrame([(1, Vectors.dense(data))],
                                       ['index', 'Features'])
    predict_df.show()
    # Prediction value of the first (only) scored row.
    output = model141.transform(predict_df).select('prediction').first()[0]

コード例 #24

0

ファイルを表示

ファイル: sparkAWSEMRClusterDeployment.py プロジェクト: DemondLove/Expedia-Hotel-Prediction-Workflow

# Write the cleansed dataset to an s3 bucket in parquet format
dataset.write.parquet("s3://expedia-hotel-recommendations-workflow/spark_OutputCleasedDataset.parquet")


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])

# Fit Decision Tree Algorithm
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dtcm = dtc.fit(trainingData)

# Save the trained Decision Tree model to the s3 bucket for future use
dtcm.save('s3://expedia-hotel-recommendations-workflow/dtcm_model')

# Load the saved Decision Tree model to illustrate how it will be imported for future use
dtcModel = DecisionTreeClassificationModel.load("s3://expedia-hotel-recommendations-workflow/dtcm_model")

# Make predictions with Decision Tree model on the Test Dataset
dtcPredictions = dtcModel.transform(testData)

# Calculate and print Accuracy score for Decision Tree Algorithm.
# The message used to call this value an "Error", but the evaluator returns
# the accuracy itself (higher is better).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dtcAccuracy = evaluator.evaluate(dtcPredictions)
print("Decision Tree accuracy = %g" % (dtcAccuracy))

# Calculate and print F1 score for Decision Tree Algorithm (likewise a
# score, not an error).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
dtcF1 = evaluator.evaluate(dtcPredictions)
print("Decision Tree f1 = %g" % (dtcF1))

コード例 #25

0

ファイルを表示

    # Load the input data.  Only the "libsvm" format is handled here, so
    # ``data`` stays None for any other ``dataType``.
    data = None
    if dataType == "libsvm":
        data = sqlContext.read.format("libsvm").load(dataPath)

    # Load the persisted model whose class matches ``algoName``.  Each model
    # class is imported only inside the branch that needs it.
    # NOTE(review): the chain has no ``else`` branch, so it leaves ``model``
    # unassigned when ``algoName`` matches none of the names below.
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
    elif algoName == "GBTRegression":
        from pyspark.ml.regression import GBTRegressionModel
        model = GBTRegressionModel.load(modelPath)

コード例 #26

0

ファイルを表示

def _get_root_node(tree: DecisionTreeClassificationModel):
    """Return the Java-side ``rootNode`` of ``tree``.

    When ``tree`` has a ``trees`` attribute, the root node of its first
    element is returned instead.
    """
    target = tree.trees[0] if hasattr(tree, 'trees') else tree
    return target._call_java('rootNode')