def lr_second_predict(lr_model_path, df, condition):
    """
    Binary classification prediction with a saved logistic regression model.
    :param lr_model_path: path to the saved model
    :param df: input data
    :param condition: feature/label columns, e.g. {"features": [12, 13, 14, 15], "label": "label"}
    :return: prediction results as a Spark DataFrame
    """
    feature_indexs = condition['features']
    label_index = condition['label']

    if label_index is None or label_index == "":  # no label column
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        lr_model = LogisticRegressionModel.load(lr_model_path)

        # 3. predict
        prediction_df = lr_model.transform(training_set).select(
            "prediction", "features")
        return prediction_df
    else:  # with a label column
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(label=x[label_index], features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        print("*****lr_model_path:", lr_model_path)
        lr_model = LogisticRegressionModel.load(lr_model_path)

        # 3. predict
        prediction_df = lr_model.transform(training_set).select(
            "prediction", "label", "features")
        return prediction_df
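# A minimal usage sketch for lr_second_predict (not from the original source).
# The model path is a hypothetical placeholder for a LogisticRegressionModel
# saved earlier, and it assumes the pyspark imports used by lr_second_predict
# (Row, Vectors, LogisticRegressionModel) are already in scope.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("lr_second_predict_demo").getOrCreate()
demo_df = spark.createDataFrame(
    [(0.1, 0.2, 0.3, 0.4, 0.0),
     (1.1, 1.2, 1.3, 1.4, 1.0)],
    ["f0", "f1", "f2", "f3", "label"])
# Feature columns can be given by name (as here) or by position, per the docstring.
condition = {"features": ["f0", "f1", "f2", "f3"], "label": "label"}
lr_second_predict("/tmp/models/lr_binary", demo_df, condition).show()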
def logistic_regression_evaluator(test_data, deal_id):
    #### In:
    # A testing data set, as generated by data_prep()
    # The deal_id you want to test a model for
    # NB: the model has to already be saved to the cloud
    #### Out:
    # An update message is printed
    # The evaluator is returned
    model = LogisticRegressionModel.load(
        f"/mnt/lotte/logistic_regression/{deal_id}/")
    predictions = model.transform(test_data.withColumnRenamed(deal_id, 'label'))

    # compute the area under the precision-recall curve on the test set
    # (use metricName="areaUnderROC" to get the ROC curve instead)
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="prediction",
        metricName="areaUnderPR"
    )
    area_under_pr = evaluator.evaluate(predictions)
    print("Logistic Regression area under PR " + deal_id + " = " + str(area_under_pr))
    return evaluator
def predict_prob(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    print(result.show(5))
    # result.write.csv('file:///opt/int_group/result123')

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    print(prob_1.show(5))

    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    print(result_prob1.show(5))

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    print(new_result_prob1.show(10))
    print(new_result_prob1)

    # find null rows
    final_null_rows = new_result_prob1.filter(new_result_prob1._c0.isNull()
                                              | new_result_prob1._c1.isNull()
                                              | new_result_prob1._c2.isNull()
                                              | new_result_prob1.prob_1_str.isNull())
    print('########### find null rows #############')
    final_null_rows.show(100)
def init():
    global model
    # Note: "iris.model" is the name of the model registered under the workspace.
    # This call should return the path to the model files on the local disk.
    model_path = Model.get_model_path('iris.model')
    # Load the saved model back into a LogisticRegressionModel
    model = LogisticRegressionModel.load(model_path)
def post_homeLoanDefault_predictions(Path):
    '''
    Purpose : This function generates predictions from the input data using a
              home-loan-default classifier.
    Args    : Path (dict)
    Output  : prediction (array)
    '''
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel

    for item in Path:
        modelPath = item['modelPath']
        dataPath = item['dataPath']

    spark = SparkSession.builder.appName('HomeCredit').getOrCreate()
    data = spark.read.parquet(dataPath)

    # load the model
    mm = LogisticRegressionModel.load(modelPath)

    # calculate predictions
    predicted = mm.transform(data)
    predictList = predicted.select('prediction').collect()
    predictList = [int(i.prediction) for i in predictList]
    return predictList
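# Hypothetical call to post_homeLoanDefault_predictions (not from the original
# source): both paths below are placeholders for a saved LogisticRegressionModel
# and a parquet file of assembled features.
paths = [{
    "modelPath": "/tmp/models/home_loan_lr",
    "dataPath": "/tmp/data/home_loan_features.parquet",
}]
predictions = post_homeLoanDefault_predictions(paths)
print(predictions[:10])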
def gen_lr_sort_model_metrics(test_df):
    from pyspark.ml.classification import LogisticRegressionModel
    logistic_regression_model = LogisticRegressionModel.load(
        "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
    lr_result = logistic_regression_model.evaluate(test_df).predictions
    lr_result.show()

    def vector_to_double(row):
        # BinaryClassificationMetrics expects (score, label) pairs
        return float(row.probability[1]), float(row.click_flag)

    score_labels = lr_result.select(["click_flag", "probability"]).rdd.map(vector_to_double)
    score_labels.collect()

    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    binary_classification_metrics = BinaryClassificationMetrics(scoreAndLabels=score_labels)
    area_under_roc = binary_classification_metrics.areaUnderROC
    print(area_under_roc)

    tp = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 1)].count()
    tn = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 0)].count()
    fp = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 1)].count()
    fn = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 0)].count()
    print("tp {} tn {} fp {} fn {}".format(tp, tn, fp, fn))
    print('accuracy is : %f' % ((tp + tn) / (tp + tn + fp + fn)))
    print('recall is : %f' % (tp / (tp + fn)))
    print('precision is : %f' % (tp / (tp + fp)))
def load_LogisticReg_Model(dataset):
    print("Accuracy of best LRC Model with CrossValidation:")
    evaluator = BinaryClassificationEvaluator()
    best_LRModel = LogisticRegressionModel.load("model/LR1/")
    predictions = best_LRModel.transform(dataset)
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy = %g" % accuracy)
def on_data(self, data):
    try:
        s = self.client_socket
        s.listen(5)
        # Now wait for a client connection.
        c, addr = s.accept()  # Establish connection with client.
        print("Received request from: " + str(addr))

        msg = json.loads(data)
        tweet_time = msg['created_at']
        text = msg['text'].replace('\n', '')
        hashtags = " "
        if msg['entities'] is not None:
            if msg['entities']['hashtags'] is not None:
                for hashtag in msg['entities']['hashtags']:
                    hashtags = hashtags + " " + hashtag['text']

        model = PipelineModel.load(Constants.sentiment_tf_idf_model_path)
        v = sql_context.createDataFrame([
            ("a", msg['text'].replace('\n', '')),
        ], ["_c0", "text"])
        v = model.transform(v)

        model2 = LogisticRegressionModel.load(Constants.sentiment_analysis_model_path)
        v = model2.transform(v)
        v_list = v.select('prediction').collect()
        sentiment = str(v_list[0].prediction)

        s_data = tweet_time + ' ~@ ' + text + ' ~@ ' + sentiment + ' ~@ ' + str(hashtags)
        print(s_data.encode('utf-8'))
        c.send(s_data.encode('utf-8'))
        c.close()
    except BaseException as e:
        print("Error on_data: %s" % str(e))
    return True
def predict(test_path, model_name, output_path):
    if model_name is None:
        model_name = 'model'
    if output_path is None:
        output_path = os.path.join(dirname(os.getcwd()), 'predict.csv')
    model_path = os.path.join(dirname(os.getcwd()), 'models', model_name)

    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('Logistic App') \
        .getOrCreate()
    # todo: delete the next line
    spark.sparkContext.setLogLevel('OFF')

    model = LogisticRegressionModel.load(path=model_path)
    raw_data = spark.read.csv(test_path, header=True)
    dataset = mature_data(raw_data)

    prediction_df = model.transform(dataset).select(
        col('id'), col('prediction').cast('int'))
    prediction_df = prediction_df.toPandas()
    prediction_df.to_csv(output_path, index=False)
def read_model(self):
    if "LogisticRegression" in self.best_model_path:
        classifier = LogisticRegressionModel.load(self.best_model_path)
    elif "DecisionTree" in self.best_model_path:
        classifier = DecisionTreeClassificationModel.load(self.best_model_path)
    elif "RandomForest" in self.best_model_path:
        classifier = RandomForestClassificationModel.load(self.best_model_path)
    elif "LinearSVC" in self.best_model_path:
        classifier = LinearSVCModel.load(self.best_model_path)

    if "VGG16" in self.best_model_path:
        featurizer_name = "VGG16"
    elif "VGG19" in self.best_model_path:
        featurizer_name = "VGG19"
    elif "InceptionV3" in self.best_model_path:
        featurizer_name = "InceptionV3"
    elif "Xception" in self.best_model_path:
        featurizer_name = "Xception"
    elif "ResNet50" in self.best_model_path:
        featurizer_name = "ResNet50"

    return featurizer_name, classifier
def testJustify(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    # note: the evaluator reads the rawPrediction column produced by the model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    accuracy = evaluator.evaluate(predictions)
    print('############ accuracy: {} ############'.format(accuracy))
    return predictions
def predict():
    model_1 = LogisticRegressionModel.load('model')
    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        # NB: a Spark ML model expects a DataFrame with a "features" column,
        # so transforming a raw Python list will not work as written.
        my_prediction = model_1.transform(data)
    return render_template('result.html', prediction=my_prediction)
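# A minimal sketch (not from the original source) of how the handler above would
# usually be wired up: the raw text is wrapped in a one-row DataFrame and run
# through a saved feature pipeline before the logistic regression model. The
# paths ("pipeline_model", "model") and the "message" column name are assumptions.
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel

spark = SparkSession.builder.getOrCreate()
pipeline_model = PipelineModel.load("pipeline_model")  # e.g. tokenizer + TF-IDF stages
lr_model = LogisticRegressionModel.load("model")

def predict_message(message):
    df = spark.createDataFrame([(message,)], ["message"])  # one-row DataFrame
    featurized = pipeline_model.transform(df)              # must yield a "features" column
    prediction = lr_model.transform(featurized)
    return prediction.select("prediction").first()[0]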
def __init__(self):
    self.logger = logging.getLogger(__name__)
    self.spark_sql_context = SparkUtility.get_spark_sql_context()
    self.spark_session = SparkUtility.get_spark_session()
    model_path = os.path.join(
        Utility.get_data_folder(DataFolder.Stock_Model, Market.US), 'lr_model')
    self.model = LogisticRegressionModel.load(model_path)
def load_model():
    lr_model = LogisticRegressionModel.load('s3a://trainingmodel/lr')
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName="InceptionV3")
    p_test = PipelineModel(stages=[featurizer, lr_model])
    return p_test
def __init__(self, model_path, scaler_path):
    self.model = LogisticRegressionModel.load(model_path)
    self.scaler = StandardScalerModel.load(scaler_path)
    self.au_PRC = 0
    self.precision = []
    self.recall = []
    self.thresholds = []
    self.matrix = []
def getOrCreateLRC(self):
    try:
        if self.LRCModel is None:
            self.LRCModel = LogisticRegressionModel.load(CONST_LRC_FILE)
    except:
        print("Creating LogisticRegression Model")
        self.LRCModel = self.createLRC()
    return self.LRCModel
def __init__(self):
    self.spark = SparkSession.builder \
        .master('yarn') \
        .appName("Yelp Online Testing") \
        .getOrCreate()
    self.lda_model = PipelineModel.load(
        'hdfs:///project/small_data/lda_model_10')
    self.lr_model = LogisticRegressionModel.load(
        'hdfs:///project/small_data/lr-model-10')
def modelPredicting(testSetWoeDF, fn):
    # Pre-transform the data to match the input format required by the ML logistic regression model
    strInd = StringIndexerModel.load(savePath + '{}/{}/strInd'.format(curDate, fn))
    lrModel = LogisticRegressionModel.load(savePath + '{}/{}/lrModel'.format(curDate, fn))
    testSetVecAse = vecAseembler.transform(testSetWoeDF)
    testSetVecAseStrInd = strInd.transform(testSetVecAse)
    testSetWithProba = lrModel.transform(testSetVecAseStrInd)
    return testSetWithProba
def classify(self, inputJson):
    self.hdfs = PyWebHdfsClient(host=self.config.acm.servers.hdfs.host,
                                port=self.config.acm.servers.hdfs.restPort,
                                user_name=self.config.acm.servers.hdfs.fileOwner)
    self.hdfsServerUrl = "hdfs://" + self.config.acm.servers.hdfs.host + ":" + str(self.config.acm.servers.hdfs.port)
    if not hasattr(self, 'sc'):
        self.sc = SparkContext()
    if not hasattr(self, 'sqlContext'):
        self.sqlContext = SQLContext(self.sc)

    schema = StructType([StructField('Category', StringType(), True),
                         StructField('Descript', StringType(), True),
                         StructField('Dates', StringType(), True),
                         StructField('DayOfWeek', StringType(), True),
                         StructField('PdDistrict', StringType(), True),
                         StructField('Resolution', StringType(), True),
                         StructField('Address', StringType(), True),
                         StructField('X', DoubleType(), True),
                         StructField('Y', DoubleType(), True)])
    test = self.sqlContext.createDataFrame(inputJson, schema)

    # pipeline = PipelineModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint")
    pipeline = PipelineModel.load(self.pipelineHdfsPath)
    testData = pipeline.transform(test)
    print("Test Dataset Count: " + str(testData.count()))

    ##########################################################
    ################## Train/load the model ##################
    ##########################################################
    # lrModel = LogisticRegressionModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint")
    lrModel = LogisticRegressionModel.load(self.modelHdfsPath)
    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 7) \
        .select("Descript", "Category", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)
    # .select("probability", "label", "prediction") \

    resultJson = predictions.filter(predictions['prediction'] == 7) \
        .select("prediction") \
        .orderBy("probability", ascending=False) \
        .toJSON().collect()
    self.sc.stop()
    return ["al sana ML!", resultJson]
def predict_prob(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    print(result.show(5))
    # result.write.csv('file:///opt/int_group/result123')

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    print(prob_1.show(5))

    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    print(result_prob1.show(5))

    # for i in range(800, 802):
    #     g = i / 1000
    #     h = g + 0.001
    #     sqlTrans = SQLTransformer(statement="SELECT _c0, _c1, _c2, prob_1[0] AS prob FROM __THIS__ WHERE prob_1[0] < h AND prob_1[0] >= g")
    #     dd = sqlTrans.transform(result_prob1)
    #     dd.write.csv('file:///opt/int_group/sql_test')

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    print(new_result_prob1.show(5))
    print(new_result_prob1)

    dd = new_result_prob1.head(1000)
    dd_df = spark.createDataFrame(dd)
    dd_df.write.csv('file:///opt/int_group/head_1kw_test')

    # for i in [1, 2, 3, 4, 5]:
    #     dd = new_result_prob1.head(i)
    #     dd_df = spark.createDataFrame(dd)
    #     dd_df.write.csv('file:///opt/int_group/head_test', mode='append')

    # DataFrame[_c0: string, _c1: string, _c2: string, prob_1_str: string]

    # Error seen when writing via toPandas():
    #   Exception: Python in worker has different version 2.7 than that in driver 3.6,
    #   PySpark cannot run with different minor versions. Please check environment variables
    #   PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.
    # new_result_prob1.toPandas().to_csv('file:///opt/int_group/result.csv')
    # new_result_prob1.toPandas().to_csv('hdfs://bcg/opt/int_group/result/result.csv')
def lr_second_evaluation(spark_session, lr_model_path, df, predict_condition, condition):
    """
    Binary classification evaluation for a logistic regression model.
    :param spark_session: the SparkSession
    :param lr_model_path: path to the saved model
    :param df: data to evaluate on
    :param predict_condition: configuration of the (parent) prediction operator
    :param condition: configuration of this operator, e.g. {"label": "label"}
    :return: a DataFrame of evaluation metrics
    """
    feature_indexs = predict_condition['features']
    label_index = condition['label']

    # 1. prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    predict_data = df.rdd.map(lambda x: func(x)).toDF()

    # 2. load the model
    print("*****lr_model_path:", lr_model_path)
    lr_model = LogisticRegressionModel.load(lr_model_path)

    # compute evaluation metrics
    result = lr_model.transform(predict_data)
    print(result.prediction)
    lrTotalCorrect = result.rdd.map(lambda r: 1 if (r.prediction == r.label) else 0).reduce(lambda x, y: x + y)
    lrAccuracy = lrTotalCorrect / float(predict_data.count())  # e.g. 0.5136044023234485

    # Clearing the default threshold would yield the raw prediction scores
    # (i.e. results with confidences) instead of hard 0/1 predictions.
    lrPredictionAndLabels = result.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels)
    print("Area under PR = %s" % lrmetrics.areaUnderPR)
    print("Area under ROC = %s" % lrmetrics.areaUnderROC)

    # return the metrics
    result = [("correct count", float(lrTotalCorrect)),
              ("accuracy", float(lrAccuracy)),
              ("Area under PR", float(lrmetrics.areaUnderPR)),
              ("Area under ROC", float(lrmetrics.areaUnderROC))]
    return spark_session.createDataFrame(result, schema=['metric', 'value'])
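# Hypothetical call pairing with lr_second_evaluation (not from the original
# source); it reuses the spark session, demo_df and column layout from the
# lr_second_predict sketch above, and the model path is a placeholder.
predict_condition = {"features": ["f0", "f1", "f2", "f3"]}
condition = {"label": "label"}
metrics_df = lr_second_evaluation(spark, "/tmp/models/lr_binary", demo_df,
                                  predict_condition, condition)
metrics_df.show()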
def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = LogisticRegressionModel.load(path)
    # print(lrModel.coefficientMatrix)

    predictions = lrModel.transform(data)
    # prediction 0 = TRUE (VERDADERO), prediction 1 = FALSE (FALSO)
    prediccion = predictions.select(
        'prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'
    return prediccionLabel, prediccion[1][0] * 100
def testResult(lrModelPath, test_data, threshold):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)

    label = predictions.select('label').collect()
    label_list = [label[i][0] for i in range(0, len(label))]
    probability = predictions.select('probability').collect()
    prob_list = [probability[i][0][1] for i in range(0, len(probability))]  # NB: take the probability of class 1

    # tag each example against the threshold
    flag = []
    for prob in prob_list:
        if prob >= threshold:
            flag.append(float(1))
        else:
            flag.append(float(0))

    # evaluation
    acc = 0
    for j in range(0, len(label_list)):
        if label_list[j] == flag[j]:
            acc += 1
    accuracy = acc / len(label_list)
    print('-------accuracy--------: {}'.format(accuracy))

    tp, fn, tn, fp = 0, 0, 0, 0
    length = len(label_list)
    for i in range(0, length):
        if label_list[i] == 0.0 and flag[i] == 0.0:
            tn += 1
        if label_list[i] == 1.0 and flag[i] == 1.0:
            tp += 1
        if label_list[i] == 1.0 and flag[i] == 0.0:
            fn += 1
        if label_list[i] == 0.0 and flag[i] == 1.0:
            fp += 1

    total = tn + tp + fn + fp
    print('tn:', tn)
    print('tp:', tp)
    print('fn:', fn)
    print('fp:', fp)
    print('total:', total)

    # precision
    precision = tp / (tp + fp)
    print('-------precision--------: {}'.format(precision))

    # recall
    recall = tp / (tp + fn)
    print('-------recall--------: {}'.format(recall))

    f1_score = 2 * ((precision * recall) / (precision + recall))
    print('-------f1_score--------: {}'.format(f1_score))
def classify(self, inputJson):
    sc = SparkContext()
    sqlContext = SQLContext(sc)
    schema = StructType([
        StructField('Category', StringType(), True),
        StructField('Descript', StringType(), True),
        StructField('Dates', StringType(), True),
        StructField('DayOfWeek', StringType(), True),
        StructField('PdDistrict', StringType(), True),
        StructField('Resolution', StringType(), True),
        StructField('Address', StringType(), True),
        StructField('X', DoubleType(), True),
        StructField('Y', DoubleType(), True)
    ])
    test = sqlContext.createDataFrame(inputJson, schema)

    pipeline = PipelineModel.load(
        "/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint")
    testData = pipeline.transform(test)
    print("Test Dataset Count: " + str(testData.count()))

    ##########################################################
    ################## Train/load the model ##################
    ##########################################################
    lrModel = LogisticRegressionModel.load(
        "/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint")
    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 7) \
        .select("Descript", "Category", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)
    # .select("probability", "label", "prediction") \

    resultJson = predictions.filter(predictions['prediction'] == 7) \
        .select("prediction") \
        .orderBy("probability", ascending=False) \
        .toJSON().collect()
    return ["al sana ML!", resultJson]
def hello():
    form = ReusableForm(request.form)
    print(form.errors)
    if request.method == 'POST':
        name = request.form['name']
        print(name)

        if form.validate():
            # Save the comment here.
            sc = SparkContext()
            sc.setLogLevel("ERROR")
            app = Flask(__name__)

            # Schema of the trained data
            schema = StructType([
                StructField("_c0", StringType()),
                StructField("_c1", StringType())
            ])
            # Schema for the input features
            predict_schema = StructType([StructField("_c1", StringType())])

            # Load the Pipeline and the Classification Model
            pipelineModel = PipelineModel.load("pipeline_Model")
            lfModel = LogisticRegressionModel.load("lr_Model")

            spark = SparkSession.builder.getOrCreate()
            input_features = [[(name)]]

            # Making predictions from the model
            predict_df = spark.createDataFrame(data=input_features, schema=predict_schema)
            transformed_pred_df = pipelineModel.transform(predict_df)
            predictions = lfModel.transform(transformed_pred_df)
            probs = predictions.select('probability').take(1)[0][0]
            n_predictions = len(probs)
            labels = pipelineModel.stages[-1].labels
            result_dict = {labels[i]: probs[i] for i in range(n_predictions)}
            # results = jsonify(result_dict)

            # Displaying the predictions
            flash(result_dict)
        else:
            flash('All the form fields are required.')

    return render_template('hello.html', form=form)
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for CrossValidator will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        numFolds=4,
        seed=42
    )
    cvModel = cv.fit(dataset)
    lrModel = cvModel.bestModel

    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModel.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    # SPARK-32092: Saving and then loading CrossValidatorModel should not change the params
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedCvModel = CrossValidatorModel.load(cvModelPath)
    for param in [
        lambda x: x.getNumFolds(),
        lambda x: x.getSeed(),
        lambda x: len(x.subModels)
    ]:
        self.assertEqual(param(cvModel), param(loadedCvModel))
    self.assertTrue(all(
        loadedCvModel.isSet(param) for param in loadedCvModel.params
    ))
def payload(json):
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    sc = SparkContext.getOrCreate()

    modelka = mdl.load('./models/amounts.model')
    va = VectorAssembler(
        inputCols=['user', 'special', 'amount', 'percent', 'term'],
        outputCol="features")
    df = spark.read.json(sc.parallelize([json]))
    test = va.transform(df)
    pred = modelka.transform(test)
    approved = pred.take(1)[0][-1]
    spark.stop()
    sc.stop()
    return approved
def predict():
    content = request.get_json(force=True)
    f1 = content["feature1"]
    f2 = content["feature2"]
    f3 = content["feature3"]
    f4 = content["feature4"]

    ####### Initializing a Spark Session #######
    spark = SparkSession.builder.appName('abc').getOrCreate()
    pipelineModel = LogisticRegressionModel.load("model")

    data = spark.createDataFrame(
        [(f1, f2, f3, f4)],
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])
    feature_cols = [
        "SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"
    ]
    for col in feature_cols:
        data = data.withColumn(col, data[col].cast(DoubleType()))
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    test = assembler.transform(data)

    ####### Getting prediction value #######
    prediction = pipelineModel.transform(test)
    model_prediction = prediction.select('prediction').collect()[0][0]
    if model_prediction == 0.0:
        result = "Iris-setosa"
    elif model_prediction == 1.0:
        result = "Iris-versicolor"
    elif model_prediction == 2.0:
        result = "Iris-virginica"
    print("Result: {}".format(result), file=sys.stderr)
    return 'OK'
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        seed=42
    )
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModel.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
    for param in [
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvsModel), param(loadedTvsModel))
    self.assertTrue(all(
        loadedTvsModel.isSet(param) for param in loadedTvsModel.params
    ))
def get_model(s3_name):
    model_name = s3_name + ".model.zip"
    print(model_name)
    get_file_from_bucket('models-dpa', model_name, 'aux.zip')
    with zipfile.ZipFile("aux.zip", 'r') as zip_ref:
        zip_ref.extractall("model")
    os.remove("aux.zip")

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    model_path = "model/" + s3_name + ".model"
    model = LogisticRegressionModel.load(model_path)
    print(model)
    shutil.rmtree("model", ignore_errors=True)
    return model
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    tvsModelPath = temp_path + "/tvsModel"
    lrModel.save(tvsModelPath)
    loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
def __init__(self, model_name, model_base_path):
    """
    Initialize the service.

    Args:
        model_name: The name of the model.
        model_base_path: The file path of the model.

    Return:
        None
    """
    super(SparkInferenceService, self).__init__()

    # TODO: Download the model files
    # local_model_base_path = filesystem_util.download_hdfs_models(
    #     model_base_path)

    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "Spark"
    self.preprocess_function, self.postprocess_function = preprocess_util.get_preprocess_postprocess_function_from_model_path(
        self.model_base_path)

    # Load the model
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel

    self.spark_session = SparkSession.builder.appName("libsvm_lr").getOrCreate()
    # TODO: Support other model types
    self.spark_model = LogisticRegressionModel.load(self.model_base_path)
    # TODO: Add a signature for Spark models
    self.model_graph_signature = "No signature for Spark MLlib models"
# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Fit the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Fit the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# Save the models
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# Load the saved models
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

spark.stop()
#!/usr/bin/env python

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.linalg import SparseVector

spark = SparkSession.builder.appName("libsvm_lr").getOrCreate()

# Load the model
model_path = "./lr_model/"
lrModel = LogisticRegressionModel.load(model_path)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Construct test data
# testset = spark.read.format("libsvm").load("./sample_libsvm_data.txt")
testset = spark.createDataFrame(
    [(1.0, SparseVector(692, [128, 129, 130], [51, 159, 20]))],
    ['label', 'features'])

# Make inference
result = lrModel.transform(testset)
result = result.first()
print("Prediction: {}, probability_of_0: {}, probability_of_1: {}".format(
    result.prediction, result.probability[0], result.probability[1]))