Example #1
0
from pyspark.ml.classification import MultilayerPerceptronClassificationModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row


def mpc_multiple_predict(mpc_model_path, df, condition):
    """
    MPC (multilayer perceptron classifier) multiclass prediction.
    :param mpc_model_path: path of the saved model
    :param df: input data
    :param condition: {"features": [12, 13, 14, 15], "label": "label"}
                      feature column indexes and label column name
    :return: prediction result as a Spark DataFrame
    """
    feature_indexs = condition['features']
    label_index = condition['label']

    if label_index is None or label_index == "":  # no label column
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        mpc_model = MultilayerPerceptronClassificationModel.load(
            mpc_model_path)

        # 3. predict
        prediction_df = mpc_model.transform(training_set).select(
            "prediction", "features")
        return prediction_df
    else:  # label column present
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(label=x[label_index],
                       features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        print("*****mpc_model_path:", mpc_model_path)
        mpc_model = MultilayerPerceptronClassificationModel.load(
            mpc_model_path)

        # 3. predict
        prediction_df = mpc_model.transform(training_set).select(
            "prediction", "label", "features")
        return prediction_df
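
A minimal usage sketch (not from the original source), assuming an active SparkSession named spark and a model previously saved under the hypothetical path /models/mpc:

df = spark.createDataFrame(
    [(5.1, 3.5, 1.4, 0.2, 0.0), (6.2, 2.9, 4.3, 1.3, 1.0)],
    ["f0", "f1", "f2", "f3", "label"])

# With a label column (kept in the output for evaluation):
pred = mpc_multiple_predict("/models/mpc", df,
                            {"features": [0, 1, 2, 3], "label": "label"})

# Without a label column (pass an empty string):
pred = mpc_multiple_predict("/models/mpc", df,
                            {"features": [0, 1, 2, 3], "label": ""})
pred.show()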
Example #2
0
from pyspark.ml.classification import (MultilayerPerceptronClassifier,
                                       MultilayerPerceptronClassificationModel)
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row


def mpc(ss, data, label_index, feature_indexs, project_url):
    # 1. build the training set
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        # use the label value from the row, not the column index itself
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = data.rdd.map(lambda x: func(x)).toDF()

    # 2. train the model
    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None
    mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128, stepSize=0.03, solver="l-bfgs")
    mpc_param.setSeed(1)
    mpc_param.setLayers([4, 2, 2])
    mpc_model = mpc_param.fit(training_set)

    # 3. save the model
    model_path = project_url + '/model/multipleClassification/mpc'
    mpc_model.write().overwrite().save(model_path)

    # 4. reload the model
    mpc2 = MultilayerPerceptronClassificationModel.load(model_path)

    # 5. predict
    result = mpc2.transform(training_set).select("prediction", "features")
    result.show()
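
A note on setLayers([4, 2, 2]): the first entry must equal the feature vector size (4 features here) and the last must equal the number of classes (2); entries in between define hidden layers. A small sketch, under the same training_set as above, that derives both sizes from the data instead of hard-coding them:

num_features = len(training_set.first()["features"])           # input layer size
num_classes = training_set.select("label").distinct().count()  # output layer size
mpc_param.setLayers([num_features, 2, num_classes])            # one hidden layer of 2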
Example #3
0
    def __init__(self, host, port):
        super(MQTTThread, self).__init__()
        self.host = host
        self.port = port
        self.client = mqtt.Client()
        self.client.on_connect = self.on_connect
        self.client.on_message = self.on_message
        self.sc = SparkContext()
        self.sqlContext = SQLContext(self.sc)
        self.savedmodel = MultilayerPerceptronClassificationModel.load("/home/ubuntu/trained-model")
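
The constructor above wires on_connect and on_message without showing them. A minimal sketch of what those callbacks might look like; the topic name and the comma-separated payload format are assumptions, not part of the original snippet:

    def on_connect(self, client, userdata, flags, rc):
        # Placeholder topic; subscribe once the broker accepts the connection.
        client.subscribe("sensors/features")

    def on_message(self, client, userdata, msg):
        # Assumes the payload is a comma-separated feature vector,
        # and that pyspark.ml.linalg.Vectors is imported.
        values = [float(v) for v in msg.payload.decode().split(",")]
        df = self.sqlContext.createDataFrame([(Vectors.dense(values),)], ["features"])
        self.savedmodel.transform(df).select("prediction").show()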
Example #4
0
    def test_multilayer_load(self):
        df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                         (1.0, Vectors.dense([0.0, 1.0])),
                                         (1.0, Vectors.dense([1.0, 0.0])),
                                         (0.0, Vectors.dense([1.0, 1.0]))],
                                        ["label", "features"])

        mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
        model = mlp.fit(df)
        self.assertEqual(model.getSolver(), "l-bfgs")
        transformed1 = model.transform(df)
        path = tempfile.mkdtemp()
        model_path = path + "/mlp"
        model.save(model_path)
        model2 = MultilayerPerceptronClassificationModel.load(model_path)
        self.assertEqual(model2.getSolver(), "l-bfgs")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))
Example #5
0
def test_model():
    # model = LinearSVCModel.load("save/bert_svc")
    # model = LogisticRegressionModel.load("save/bert_logistic")
    # model = MultilayerPerceptronClassificationModel.load("save/word2vec_nn")
    model = MultilayerPerceptronClassificationModel.load("save/tencent2vec_nn")

    predictions = model.transform(test)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Accuracy = %g " % accuracy)

    result = predictions.select("prediction", "label", "comment").collect()
    for row in result:
        print("label=%s, prediction=%s, comment=%s" %
              (row.label, row.prediction, row.comment))


# FeedforwardNeuralNet(feature_size)
# test_model()
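
test in test_model() above is a global that the snippet never defines. A hypothetical sketch of how it might be prepared, assuming a SparkSession named spark and precomputed features stored at a made-up Parquet path:

test = spark.read.parquet("save/test_features.parquet")  # columns: features, label, comment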
Example #6
0
    # We remove the StopWords
    tweets_df = StopWordsRemover(
        inputCol="lowercase_tweets",
        outputCol="processed_tweets").transform(tweets_df)
    # We drop the unused columns
    tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "lang",
                               "date")
    # We load the language model
    model_path = "s3://" + bucket_name + "/models/w2v_model"
    loaded_model = Word2VecModel.load(model_path)
    # We add the output columns : it is the average of the words' vectors for each tweet
    tweets_df = loaded_model.transform(tweets_df)

    # We load the classifier
    clf_path = "s3://" + bucket_name + "/models/mpc_model"
    loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
    predictions = loaded_clf.transform(tweets_df)

    # We keep the probability only for the predicted sentiment
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    predictions = predictions.withColumn("probability",
                                         to_array("probability"))
    predictions = predictions.withColumn("probability",
                                         array_max("probability"))

    # We assign a weight of 0.5 to negative tweets
    compute_weights = udf(lambda x: x if x == 1.0 else 0.5, FloatType())

    # The sentiment score is in [0, 0.5] if the value is negative and [0.5, 1] if positive
    predictions = predictions.withColumn("weights", compute_weights("prediction")) \
    .withColumn("sentiment_score", col("probability")*col("weights")) \
Example #7
0
	# spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider","org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
	spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn")

	return spark



if __name__ == '__main__':
	spark = prepare()

	# 1. load the data
	df_result = load_training_data(spark)
	df_validate = df_result #.select("id", "label", "features").orderBy("id")

	# 2. load model
	model = MultilayerPerceptronClassificationModel.load("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/nw")

	# 3. compute accuracy on the test set
	result = model.transform(df_validate)
	result.persist()
	predictionAndLabels = result.select("id", "prediction", "label").orderBy("id")
	evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
	print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

	# 4. Test with Pharbers defined methods
	result = result.withColumn("JACCARD_DISTANCE_MOLE_NAME", result.JACCARD_DISTANCE[0]) \
				.withColumn("JACCARD_DISTANCE_DOSAGE", result.JACCARD_DISTANCE[1]) \
				.drop("JACCARD_DISTANCE", "features").drop("rawPrediction", "probability")
	result.printSchema()
	result.orderBy("id").repartition(1).write.mode("overwrite").csv("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/result")
	df_ph = result.where((result.prediction == 1.0) | (result.label == 1.0))
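
Note that result.JACCARD_DISTANCE[0] in step 4 indexes into an array column; it is equivalent to result.JACCARD_DISTANCE.getItem(0) and yields a scalar column that can be attached with withColumn as shown.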