import sys

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline

# Fit the pipeline defined in model.py on the training data and save it.
df = spark.read.json(sys.argv[1])
pipeline_model = pipeline.fit(df)
pipeline_model.write().overwrite().save(sys.argv[2])
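# Every script in this section imports a pre-built `pipeline` from a local
# model module that is not shown here. A minimal sketch of what such a
# module might contain, assuming a text-regression setup over the
# reviewText/overall columns used in these scripts (the exact stages are
# an assumption, not the original module):
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, SQLTransformer
from pyspark.ml.regression import LinearRegression

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features")
# Hypothetical stage: expose the rating as the label column the estimator expects.
relabel = SQLTransformer(statement="SELECT *, overall AS label FROM __THIS__")
lr = LinearRegression(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages=[tokenizer, hashing_tf, relabel, lr])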
import sys
import logging

#
# Read script arguments
#
try:
    train_path = sys.argv[1]
    model_path = sys.argv[2]
except IndexError:
    logging.critical("Need to pass both train dataset path and model path")
    sys.exit(1)

logging.info(f"TRAIN_PATH {train_path}")
logging.info(f"MODEL_PATH {model_path}")

#
# Model importing
#
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline

#
# Training
#
dataset = spark.read.json(train_path, multiLine=True)
dataset_cleaned = dataset.select("id", "reviewText", "overall")
pipeline_model = pipeline.fit(dataset_cleaned)
pipeline_model.write().overwrite().save(model_path)
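# A minimal sketch of the matching inference step, assuming the model
# directory written by save() above and a test file with the same schema
# (the paths are placeholders; the "prediction" column name assumes the
# pipeline ends in a standard Spark ML estimator):
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

spark = SparkSession.builder.getOrCreate()

pipeline_model = PipelineModel.load("path/to/model")           # placeholder
test = spark.read.json("path/to/test.json", multiLine=True)    # placeholder
predictions = pipeline_model.transform(test.select("id", "reviewText", "overall"))
predictions.select("id", "prediction").show(5)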
import sys
import pickle

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline, sklearn_est, vectorToArray

train_path = sys.argv[1]
pipeline_path = sys.argv[2]
model_path = sys.argv[3]

# Fit the Spark feature pipeline and apply it to the training data.
train = spark.read.json(train_path)
pipeline_model = pipeline.fit(train)
train = pipeline_model.transform(train)

# Convert the Spark ML vector column to a plain array so it can be fed
# to scikit-learn; localCheckpoint() materializes the result once.
train = train.withColumn("features_array", vectorToArray("features")).localCheckpoint()

# Collect the training data onto the driver and fit the scikit-learn estimator.
df = train.select('label', 'features_array').toPandas()
sklearn_est = sklearn_est.fit(df['features_array'].tolist(), df['label'].tolist())

# Persist both halves of the model: the Spark feature pipeline and the
# pickled scikit-learn estimator.
pipeline_model.write().overwrite().save(pipeline_path)
with open(model_path, "wb") as f:
    pickle.dump(sklearn_est, f)
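# A minimal sketch of inference for the hybrid setup above: the saved Spark
# pipeline rebuilds the features, and the unpickled scikit-learn estimator
# predicts on the driver. vector_to_array (pyspark.ml.functions, Spark 3.0+)
# stands in for the custom vectorToArray UDF; the paths are placeholders.
import pickle

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.functions import vector_to_array

spark = SparkSession.builder.getOrCreate()

pipeline_model = PipelineModel.load("path/to/pipeline")        # placeholder
with open("path/to/sklearn_model.pkl", "rb") as f:             # placeholder
    sklearn_est = pickle.load(f)

test = spark.read.json("path/to/test.json")                    # placeholder
features = (pipeline_model.transform(test)
            .withColumn("features_array", vector_to_array("features"))
            .select("features_array")
            .toPandas())
predictions = sklearn_est.predict(features["features_array"].tolist())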
import sys

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel('WARN')

from model import pipeline

path_to_data = sys.argv[1]
model_path = sys.argv[2]

# Read the training data
data = spark.read.json(path_to_data)
data = data.select('reviewText', 'overall')

# Fit the model
pipeline_model = pipeline.fit(data)

# Save the fitted model
pipeline_model.write().overwrite().save(model_path)
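# None of these scripts evaluate the fitted model. A minimal sketch of a
# held-out evaluation, assuming the pipeline emits a standard "prediction"
# column and the rating in "overall" is the regression target (the paths
# are placeholders):
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.getOrCreate()

model = PipelineModel.load("path/to/model")          # placeholder
test = spark.read.json("path/to/test.json")          # placeholder
predictions = model.transform(test.select("reviewText", "overall"))

evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction",
                                metricName="rmse")
print("RMSE:", evaluator.evaluate(predictions))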