def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())
        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(title=w[1]))
        wordsDataFrame = spark.createDataFrame(rowRdd)
        # load model pipeline and score the micro-batch
        model = PipelineModel.load('kmeans')
        prediction = model.transform(wordsDataFrame).select("6_kmeans")
        prediction.show(5)
    except:
        pass
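# For context, a minimal sketch of how a pipeline like the 'kmeans' model loaded above
# could have been fitted and saved beforehand. This is an assumption for illustration,
# not the original training code: the feature stages and the sample data are placeholders,
# only the column names "title" and "6_kmeans" are taken from the snippet above.
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Tokenizer, HashingTF

tokenizer = Tokenizer(inputCol="title", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=1000)
kmeans = KMeans(k=6, featuresCol="features", predictionCol="6_kmeans")

train_df = spark.createDataFrame([("some title text",), ("another title",)], ["title"])
kmeans_pipeline = Pipeline(stages=[tokenizer, hashingTF, kmeans])
kmeans_pipeline.fit(train_df).write().overwrite().save("kmeans")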
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))
        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    # Z-score normalize the play counts per user
    w = Window.partitionBy("user_id")

    def z_score(c, w):
        return (col(c) - mean(c).over(w)) / stddev(c).over(w)

    test_z = test.select("user_id", "track_id", "count", z_score("count", w).alias("count2"))
    test_z.createOrReplaceTempView('test_z')
    test = spark.sql('SELECT user_id, track_id, COALESCE(count2, count) AS count FROM test_z')
    test.createOrReplaceTempView('test')
    print('Test Z created')

    # Creating the train sample:
    # all validation and test users from train, plus 10% of the rest of the train set
    train_sample = spark.read.parquet('hdfs:/user/dev241/extension3_zscores.parquet')
    print("Training sample loaded")

    # Index user and track ids with the previously fitted StringIndexer pipeline
    string_indexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = string_indexer.transform(test)
    train_idx = string_indexer.transform(train_sample)

    # Best hyperparameters found during tuning
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    # Fit the implicit-feedback ALS model
    als = ALS(rank=rank, alpha=alpha, regParam=reg, userCol="user_idx",
              itemCol="track_idx", ratingCol="count",
              coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for Ext3 done")
    model.save("Extension3(z_score)")
    print("Model save for Ext3 done")

    # Test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: (
        [track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Ext 3 Test mean Average Precision : ", mavgp)
from flask import Flask
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel

app = Flask(__name__)

# get spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Sparkify") \
    .getOrCreate()

# load model
model = PipelineModel.load('../model/classifier')


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # get spark context
    sc = SparkContext.getOrCreate()

    # create spark dataframe to predict customer churn using the model
    # [gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]
    gender = ''
    level = 0
## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available,
## this call stores the pipeline to a local file in the current directory. If HDFS & Hadoop are
## available, it stores the pipeline in the HDFS home directory of the current user. Absolute
## paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction_output.p1").first()["p1"] > hamThreshold


print(isSpam("Michal, h2oworld party tonight in MV?", loaded_model))
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Boilerplate Spark stuff:
spark = SparkSession\
    .builder\
    .appName("diabetes_patient_readmission_onlinescore")\
    .getOrCreate()
sc = spark.sparkContext

# Load up the patient discharge file, use the trained model and predict readmissions
test = spark.table("diabetic_data_original_parquet")

modelPath = "hdfs:///tmp/diabetes/diabetic_data_model/"
from pyspark.ml import PipelineModel
sameModel = PipelineModel.load(modelPath)

predictions = sameModel.transform(test)
predictions.select('encounter_id', 'patient_nbr', 'prediction').filter('prediction = 1').show(50)
""" 验证模型 """ start=time.time() # 再次对测试集的数据进行词转向量的转化 test_set = model.transform(test_set) # 再次将多列数据转化为单列的向量列(决策树可以识别的类型) # test_set = assembler.transform(test_set) # 再次使用cv_pipelineModel进行验证,把在pipeline中的所有transform都执行一遍(???) bestDt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="entropy", maxDepth=bestModel.depth, maxBins=32) dt_pipeline = Pipeline(stages=[assembler, bestDt]) # (???) dt_model = dt_pipeline.fit(manbing) dt_model.write().overwrite().save("./models/courage_dtmodel/") sameDTModel = PipelineModel.load("./models/courage_dtmodel/") predictions = sameDTModel.transform(test_set) # (???) # 使用评估器对预测结果进行评估得到auc auc = evaluator.evaluate(predictions) # (???) print("auc="+str(auc)) acc = predictions.filter(predictions['label'] == predictions['prediction']).count() / float(predictions.count()) print("acc="+str(acc)) end = time.time() print("预测用时:{}".format(end-start)) """ auc=0.9834579598810581 acc=0.9598010774968918 """
from pyspark.ml import PipelineModel

newsModel = PipelineModel.load("hdfs://localhost:19000/user/Waseem/bestPipeline")
sportsModel = PipelineModel.load("hdfs://localhost:19000/user/Waseem/sportsPipeline")
model = [newsModel, sportsModel]
sentenceDataFrame = spark.createDataFrame([(0, 1, 2), (0, 1, 2), (1, 1, 2)], ["label", "a", 'b'])
from pyspark.sql import functions
df = sentenceDataFrame.withColumn('c', functions.lit(np.nan))
df.show()

#############
# Test the pipeline
#############
from pyspark.ml import Pipeline, PipelineModel, Transformer

blankTransformer = BlankTransformer(inputCols=["a", "b", "c"], outputCols=["a_1", "b_1", "c_1"])
p = Pipeline(stages=[blankTransformer])
# df = spark.sparkContext.parallelize([(1, None), (2, 1.0), (3, 0.5)]).toDF(["key", "value"])
pm = p.fit(df)
pm.transform(df).show()

###########################
# Test saving the PipelineModel, then loading and checking it
############################
pm.write().overwrite().save('./test/test.model')
pm2 = PipelineModel.load('./test/test.model')
print('matches?', pm2.stages[0].extractParamMap() == pm.stages[0].extractParamMap())
print(pm2.stages[0].extractParamMap())
pm2.transform(df).show()
def transform(df, model_path, prediction_column):
    pipeline_model = PipelineModel.load(model_path)
    predictions = pipeline_model.transform(df)
    predictions = predictions.drop(*["features", "rawPrediction", "probability",
                                     "categorical_features", "continuous_features",
                                     "continuous_vector"])
    predictions = predictions.withColumnRenamed("prediction", prediction_column)
    return predictions
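# A small usage sketch for the helper above. The input data, model path, and column name
# are assumptions for illustration, not values from the original project.
scored_df = transform(spark.read.parquet("data/customers.parquet"),
                      model_path="models/churn_pipeline",
                      prediction_column="churn_prediction")
scored_df.show(5)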
from pyspark.ml import PipelineModel
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, FloatType

pipelinePath = "./LDA-pipeline-model"
pipeline_model = PipelineModel.load(pipelinePath)

# 5. check the topic distribution among the dataset
df_with_topics = pipeline_model.transform(df).select("tweet_text", "topicDistribution")

to_array = F.udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
df_with_topics_toArray = df_with_topics.select(
    "tweet_text", to_array("topicDistribution").alias("topicDistributionArray"))
df_with_topics_final = df_with_topics_toArray.select(
    ["tweet_text"] + [(F.col("topicDistributionArray")[i]).alias("topic_" + str(i))
                      for i in range(10)])
df_with_topics_final.agg(
    *[F.sum(F.col("topic_" + str(i))) for i in range(n_topics)])
# Keep the major version from lmp_version
dataset = dataset.withColumn('lmp_version_split', F.split(F.col('lmp_version'), "-").getItem(0))

# Split nap, uap and lap out of the address column
dataset = dataset.withColumn('nap', dataset.address.substr(1, 5))
dataset = dataset.withColumn('uap', dataset.uap_lap.substr(1, 2))
dataset = dataset.withColumn('lap', dataset.uap_lap.substr(4, 11))

# StringIndexer
string_indexer_model_path = "{}/data/stringIndexerModel.bin".format(base_path)
string_indexer = PipelineModel.load(string_indexer_model_path)
dataset = string_indexer.transform(dataset)

# MinMaxScaler
minMaxScaler_model_path = "{}/data/minMaxScalerModel.bin".format(base_path)
minMaxScaler = PipelineModel.load(minMaxScaler_model_path)
dataset = minMaxScaler.transform(dataset)

# OneHotEncoding
ohe_model_path = "{}/data/oheModel.bin".format(base_path)
ohe = PipelineModel.load(ohe_model_path)
dataset = ohe.transform(dataset)

# VectorAssembler
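# For context, a minimal sketch (assumed, not the original training code) of how a
# preprocessing PipelineModel such as stringIndexerModel.bin could have been fitted and
# persisted beforehand; the indexed columns are placeholders.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
            for c in ["nap", "uap", "lap"]]
indexer_model = Pipeline(stages=indexers).fit(dataset)
indexer_model.write().overwrite().save("{}/data/stringIndexerModel.bin".format(base_path))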
import subprocess
from kafka import KafkaConsumer
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
indexName = "twitternb"
indexName2 = "twitterlr"
typeName1 = "NaiveB"
typeName2 = "logisticR"

sc = SparkContext()
sqlContext = SQLContext(sc)
consumer = KafkaConsumer('twitter', group_id='my-group',
                         bootstrap_servers=['localhost:9092'])
nbModel = PipelineModel.load("APJ180001_nb.model")
lrModel = PipelineModel.load("APJ180001_lr.model")
evaluator = MulticlassClassificationEvaluator()

count = 0
sum = 0
avg = 0
sum2 = 0
count2 = 0
labels = {}
index = 0
accidentalTweetsNB = open("accidentalTweetsNB.txt", 'a+')
accidentalTweetsLR = open("accidentalTweetsLR.txt", 'a+')

for message in consumer:
def construct_component_from_pipe_identifier(
        language, nlp_ref, nlu_ref, path=None, is_licensed=False):
    # -> NLUPipeline
    """
    Creates a list of components from a Spark NLP Pipeline reference
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param is_licensed: Whether the pipe is licensed or not
    :param nlu_ref: NLU ref that points to this pipe
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param path: Load component_list from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    """
    if 'language' in nlp_ref:
        # special edge case for lang detectors
        language = 'xx'
    if path is None:
        if is_licensed:
            pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
        else:
            pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []

    os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict()
    hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()
    ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()
    for jsl_anno_object in iterable_stages:
        anno_class_name = type(jsl_anno_object).__name__
        logger.info(
            f"Extracting model from Spark NLP pipeline: obj= {jsl_anno_object} class_name = {anno_class_name} and creating Component"
        )
        if anno_class_name in os_annos.keys():
            jsl_anno_id = os_annos[anno_class_name]
            nlu_component = ComponentMap.os_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.open_source)
            constructed_components.append(nlu_component)
        elif anno_class_name in hc_annos.keys():
            # Licensed HC
            jsl_anno_id = hc_annos[anno_class_name]
            nlu_component = ComponentMap.hc_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.hc)
            constructed_components.append(nlu_component)
        elif anno_class_name in ocr_annos:
            # Licensed OCR (WIP)
            jsl_anno_id = ocr_annos[anno_class_name]
            nlu_component = ComponentMap.ocr_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.ocr)
            constructed_components.append(nlu_component)
        else:
            raise ValueError(
                f'Could not find matching nlu component for annotator class = {anno_class_name}'
            )
    if None in constructed_components or len(constructed_components) == 0:
        raise Exception(
            f"Failure inferring type anno_class={anno_class_name} ")
    return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
        PipeUtils.set_column_values_on_components_from_pretrained_pipe(
            constructed_components, nlp_ref, language, path))
run_mode = 'batch'  # 'batch' or 'streaming'
outputs = 'outputs.json'

#################################################

if len(sys.argv) != 2:
    print("Usage: <models folder> ")
    exit(1)
models_folder = sys.argv[1]

spark = SparkSession.builder.master('local[{}]'.format(spark_threads)).appName(
    'local-testing-pyspark-context').getOrCreate()

# Load the model
print("Loading model...")
model = PipelineModel.load(models_folder + '/' + ml_model_name)
print("Done.")

#with open(models_folder + '/' + input_dtypes, 'r') as f:
#    types = {x: np.dtype(y) for x, y in json.load(f).items()}


def read_data():
    print("Reading data")
    df = pd.read_json(stage2_outputs)
    if sample_size:
        df = df.sample(sample_size)
    print("Size of test data is : ", len(df))
    return df
        header=True,
        maxFilesPerTrigger=1)\
    .load()

# If the numeric LotFrontage field contains null, the whole row is written as zeros,
# so we declare the column as string in the schema, then cast it to Float and fill the null values.
# Visually there is no Id with a null value; where they come from later is unclear.
test = test_start\
    .withColumn("LotFrontage", F.expr("CAST(LotFrontage as FLOAT)"))\
    .na.fill({"LotFrontage": 60.0, "Id": 0})

out = console_output(test.select("Id", "LotFrontage", "LotArea"), 100)
out.stop()

# loading the pipeline takes quite a while
pipeline_model = PipelineModel.load("my_GB_model8_ob")

"""
/cassandra/bin/cqlsh 10.0.0.18 — start the shell
# create the schema
# CREATE KEYSPACE lesson8
#   WITH REPLICATION = {
#     'class' : 'SimpleStrategy', 'replication_factor' : 1 };

use lesson8;
DROP TABLE houses_price_prediction;
CREATE TABLE IF NOT EXISTS houses_price_prediction (Id int primary key, SalePrice int);
"""
def main(sentiment_input, user_input, review_input, model_input, output_folder):
    # read input files
    df_sentiment = spark.read.csv(sentiment_input, header=True)
    df_user = spark.read.parquet(user_input)
    df_review = spark.read.parquet(review_input)

    # get 50 users
    df_50_users = df_user.limit(50)

    # cross join user and business
    df_usr_bus_all = df_50_users \
        .crossJoin(df_sentiment) \
        .where(df_sentiment['ZipCode'].isNull() == False) \
        .select(
            df_sentiment['BusinessID'],
            df_user['UserID'],
            df_user['UserName'],
            df_user['ReviewCount'].alias('UserReviewCount'),
            df_user['AverageStars'].alias('UserAverageStars'),
            functions.lit(0).alias('ReviewStars'),
            functions.dayofyear(functions.current_date()).alias('ReviewDayOfYear'),
            df_sentiment['Name'].alias('BusinessName'),
            df_sentiment['ZipCode'].alias('BusinessPostalCode'),
            df_sentiment['ZipCode'].substr(1, 3).alias('BusinessNeighborhood'),
            df_sentiment['Latitude'].cast(types.FloatType()),
            df_sentiment['Longitude'].cast(types.FloatType()),
            df_sentiment['avg_neg'].cast(types.FloatType()).alias('AverageNegative'),
            df_sentiment['avg_neu'].cast(types.FloatType()).alias('AverageNeutral'),
            df_sentiment['avg_pos'].cast(types.FloatType()).alias('AveragePositive'),
            df_sentiment['avg_composite_score'].cast(types.FloatType()).alias('AverageComposite'))

    # left join with reviews
    df_joined = df_usr_bus_all.join(df_review, ['BusinessID', 'UserID'], 'left_outer') \
        .select(df_review['ReviewID'],
                df_usr_bus_all['BusinessID'],
                df_usr_bus_all['UserID'],
                df_usr_bus_all['UserName'],
                df_usr_bus_all['UserReviewCount'],
                df_usr_bus_all['UserAverageStars'],
                df_usr_bus_all['ReviewStars'],
                df_usr_bus_all['ReviewDayOfYear'],
                df_usr_bus_all['BusinessName'],
                df_usr_bus_all['BusinessPostalCode'],
                df_usr_bus_all['BusinessNeighborhood'],
                df_usr_bus_all['Latitude'],
                df_usr_bus_all['Longitude'],
                df_usr_bus_all['AverageNegative'],
                df_usr_bus_all['AverageNeutral'],
                df_usr_bus_all['AveragePositive'],
                df_usr_bus_all['AverageComposite'])

    # get restaurants that the user has not visited
    df_not_visited_rests = df_joined.where(df_joined['ReviewID'].isNull())

    # load the model
    loaded_model = PipelineModel.load(model_input)

    # use the model to make predictions
    predictions = loaded_model.transform(df_not_visited_rests)
    predictions_init = predictions.select(predictions['BusinessID'],
                                          predictions['BusinessName'],
                                          predictions['BusinessPostalCode'],
                                          predictions['BusinessNeighborhood'],
                                          predictions['UserID'],
                                          predictions['UserName'],
                                          predictions['UserReviewCount'],
                                          predictions['UserAverageStars'],
                                          predictions['ReviewDayOfYear'],
                                          predictions['prediction'].alias('PredictedReviewStar'),
                                          predictions['Latitude'],
                                          predictions['Longitude'],
                                          predictions['AverageNegative'],
                                          predictions['AverageNeutral'],
                                          predictions['AveragePositive'],
                                          predictions['AverageComposite'])

    # clip scores: > 5 becomes 5 and < 0 becomes 0
    predictions_final = predictions_init.withColumn(
        'FinalStar',
        functions.when(predictions_init["PredictedReviewStar"] >= 5, 5)
        .otherwise(functions.when(predictions_init["PredictedReviewStar"] <= 0, 0)
                   .otherwise(predictions_init['PredictedReviewStar'])))

    # partition by user
    window = Window.partitionBy(predictions_final['UserID']).orderBy(
        predictions_final['FinalStar'].desc())

    # get the top 10 scores for each user based on the partition
    prediction_to_save = predictions_final.select(
        '*', functions.row_number().over(window).alias('rank')).filter(col('rank') <= 10)

    # save predictions to output
    prediction_to_save.coalesce(1).write.csv(output_folder + '/TestModel', header=True)
from pyspark import SparkContext
from pyspark.ml import PipelineModel
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from nltk.corpus import stopwords

if __name__ == '__main__':
    # create a spark context
    sc = SparkContext.getOrCreate()
    # create a sql spark context
    sql = SQLContext(sc)

    # load the trained pipeline
    pipeline = PipelineModel.load('hdfs:///model_lr')
    file_path = '/data/twitter_data/z_sample.csv'

    # defining a schema for the data
    schema = StructType([
        StructField('polarity', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('date', StringType(), True),
        StructField('query', StringType(), True),
        StructField('user', StringType(), True),
        StructField('text', StringType(), True)
    ])
    useless_columns = ['id', 'date', 'query', 'user']
def main(file1, file2, input_model, u_id, sim_bus_limit=3):
    data = spark.read.parquet(file1)
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)

    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text',
                                    outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)

    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token', outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol='nonstopwrd', outputCol='raw_features', minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="idf_vec")
    word2Vec = Word2Vec(vectorSize=500, minCount=5, inputCol='nonstopwrd',
                        outputCol='word_vec', seed=123)
    #vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')

    pipeline = Pipeline(stages=[
        regexTokenizer, stopWordsRemover, countVectorizer, idf, word2Vec
    ])
    #pipeline_model = pipeline.fit(review_df)
    #pipeline_model.write().overwrite().save('content_userid')

    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    usr_rev_bus = spark.sql(
        'SELECT distinct business_id FROM review where stars >= 3.0 and user_id = "{}"'
        .format(u_id))
    bus_list = [i for i in usr_rev_bus.collect()]

    for b_id in bus_list:
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id[0]][0]
        similar_business_rdd = sc.parallelize(
            (i[0], float(CosineSim(input_vec, i[1]))) for i in all_business_vecs)
        similar_business_df = spark.createDataFrame(
            similar_business_rdd).withColumnRenamed(
                '_1', 'business_id').withColumnRenamed('_2', 'score').orderBy(
                    "score", ascending=False)
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id[0]).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id[0]))
        # get restaurants similar to the user_id
        result = similar_businesses_df.union(similar_business_df)
    result.cache()

    # filter out those that have been reviewed before by the user
    d = [i[0] for i in usr_rev_bus.collect()]
    df_1 = result.filter(~(col('business_id').isin(d))).select('business_id', 'score')
    #df_1 = result.join(usr_rev_bus, 'business_id', 'left_outer').where(col("usr_rev_bus.business_id").isNull()).select([col('result.business_id'), col('result.score')])
    df_2 = df_1.orderBy("score", ascending=False).limit(sim_bus_limit)
    df_result = df_business.join(df_2, 'business_id', 'right').select(
        'business_id', 'score', 'name', 'categories', 'latitude', 'longitude')
    df_result.show()
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel, Pipeline
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField, SelectField, FloatField
from wtforms.widgets import html5

app = Flask(__name__)

# Initiate SparkSession
spark = SparkSession.builder \
    .master('local') \
    .appName('sparkify') \
    .getOrCreate()

# load model
model_gbt = PipelineModel.load("../model/sparkify_model")

# Load dataframe
df_ML = spark.read.parquet("../model/sparkify.parquet")
df_pd = df_ML.toPandas()
location_list = df_pd.location_first.unique().tolist()
location_list = [(location, location) for location in location_list]

# Graph 1
graph1 = df_pd.groupby('gender')['churn'].value_counts(normalize=True).unstack()
g1_name1 = str(graph1.columns[0])
g1_name2 = str(graph1.columns[1])
g1_x = graph1.index
g1_y1 = graph1.values.T[0]
g1_y2 = graph1.values.T[1]

# Graph 2
import numpy as np
from flask import Flask, request, abort
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StringIndexerModel

app = Flask(__name__)
sc = SparkContext('local')
sqlContext = SQLContext(sc)

list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

lr_model = PipelineModel.load('/spark_models/lr_model')
bc_model = PipelineModel.load('/spark_models/bc_model')
mc_model = PipelineModel.load('/spark_models/mc_model')

# https://stackoverflow.com/questions/45885044/getting-labels-from-stringindexer-stages-within-pipeline-in-spark-pyspark
mc_classes = {
    x._java_obj.getOutputCol(): x.labels
    for x in mc_model.stages if isinstance(x, StringIndexerModel)
}
mc_classes = mc_classes['label']


@app.route('/lr')
def linear_regression():
    try:
        df = sqlContext.createDataFrame(
def load_lda_model(spark):
    register_remove_punctuation_udf(spark)
    ldaPipelineModel = PipelineModel.load(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")
    #ldaPipelineModel.stages[0] = SQLTransformer(statement="SELECT jokeID, clean_text_udf(raw_text) text FROM __THIS__")
    return ldaPipelineModel
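# A small usage sketch (assumed, not part of the original source): apply the loaded LDA
# pipeline to a dataframe of raw joke text. "jokes_df" is a placeholder, and the output
# column name assumes Spark's default topicDistributionCol.
lda_model = load_lda_model(spark)
jokes_with_topics = lda_model.transform(jokes_df)
jokes_with_topics.select("jokeID", "topicDistribution").show(5, truncate=False)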
def load_model(building_id, meter):
    model_path = "output/als_model_{0}_{1}".format(building_id, meter)
    return PipelineModel.load(model_path)
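# A small usage sketch (assumed, not from the original source): load the per-building,
# per-meter model and score a dataframe of readings. The ids and "readings_df" are
# placeholders for illustration.
model = load_model(building_id=1099, meter=0)
predictions = model.transform(readings_df)
predictions.select("prediction").show(5)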
def cargar_juez(path, tipo, mongo_uri=None):
    # Load a saved judge model; optionally persist its training set to MongoDB first
    if tipo == 1 and mongo_uri:
        df = spark_session().read.json(path + "_trainingset")
        df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)
    return PipelineModel.load(path)
    lambda x: datetime.strftime(
        datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S'
    )
)
df = df.withColumn("created_at", date_process(df.created_at))

################# Pre-processing the data
pre_process = udf(
    lambda x: re.sub(r'[^A-Za-z\n ]|(http\S+)|(www.\S+)', '',
                     x.lower().strip()).split(),
    ArrayType(StringType())
)
df = df.withColumn("cleaned_data", pre_process(df.message)).dropna()

################# Passing into ml pipeline
model_path = SRC_DIR.joinpath('models')
pipeline_model = PipelineModel.load(str(model_path))
prediction = pipeline_model.transform(df)

'''
The labels are labelled with
    positive (4) as 0.0
    negative (0) as 1.0
'''
prediction = prediction \
    .select(prediction.cleaned_data, prediction.created_at,
            prediction.timestamp, prediction.message, prediction.prediction)
# print(prediction.schema)

################# Write to Delta
from flask import Flask, jsonify, render_template, request
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import json

MASTER = 'local'
APPNAME = 'simple-ml-serving'
MODEL_PATH = 'file:///home/cdsw/cdsw-simple-serving-python/model/spark-model'

spark = SparkSession.builder.master(MASTER).appName(APPNAME).getOrCreate()
model = PipelineModel.load(MODEL_PATH)


def classify(input):
    #target_columns = input.columns + ["prediction"]
    target_columns = ["prediction"]
    return model.transform(input).select(target_columns).collect()


# webapp
app = Flask(__name__)


@app.route('/api/predict', methods=['POST'])
def predict():
    input_df = spark.sparkContext.parallelize([request.json]).toDF()
    output = classify(input_df)
    return jsonify(input=request.json, prediction=output)


@app.route('/')
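# A hypothetical client call for the /api/predict route above (not from the original
# source). The JSON field names are placeholders for whatever schema the saved pipeline
# expects, and the URL assumes the app runs on Flask's default local port.
import requests

payload = {"feature1": 1.0, "feature2": "some value"}
resp = requests.post("http://localhost:5000/api/predict", json=payload)
print(resp.json())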
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import PipelineModel
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder \
    .appName("Telco Customer Churn") \
    .master("local[*]") \
    .getOrCreate()

model = PipelineModel.load("models/spark/mlp")

features = ["intl_plan", "account_length", "number_vmail_messages", "total_day_calls",
            "total_day_charge", "total_eve_calls", "total_eve_charge",
            "total_night_calls", "total_night_charge", "total_intl_calls",
            "total_intl_charge", "number_customer_service_calls"]


def predict(args):
    account = args["feature"].split(",")
    feature = spark.createDataFrame([account[:1] + list(map(float, account[1:12]))], features)
    result = model.transform(feature).collect()[0].prediction
    return {"result": result}


#features = ["intl_plan_indexed", "account_length", "number_vmail_messages", "total_day_calls",
#            "total_day_charge", "total_eve_calls", "total_eve_charge",
#            "total_night_calls", "total_night_charge", "total_intl_calls",
#            "total_intl_charge", "number_customer_service_calls"]

predict({
    "feature": "no, 128, 25, 256, 110, 197.4, 50, 244.7, 91, 10, 5, 1"
})
print(str(edate))

# paragraph 3 - send start email
send_mail(
    "Email Cadence Model Scoring Starting",
    "*****@*****.**",
    "Email Cadence Model Scoring: " + str(today),
)

# paragraph 4 - loading model from file
try:
    # load model from exported file
    lrModel = PipelineModel.load(
        "/user/datascience/test_db.db/features/lrModel")
    print("Model Loaded")
except Exception as e:
    print(str(e))
    errormsg = errormsg + "Model Failed Loading\n"

# paragraph 5 - data config
### SCORING
print("Scoring")
try:
    ## change date as parameter to take in current date (or date of score)
    scoring_query = "select * from test.features where ds = '" + str(edate) + "'"
## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available,
## this call stores the pipeline to a local file in the current directory. If HDFS & Hadoop are
## available, it stores the pipeline in the HDFS home directory of the current user. Absolute
## paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model):
    smsTextDF = spark.createDataFrame([(smsText, )], ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction").first()["prediction"] == "spam"


isSpamMsg = isSpam("Michal, h2oworld party tonight in MV?", loaded_model)
assert not isSpamMsg
# finding spark
#import findspark
#findspark.init('/home/pfcor/spark-2.1.0-bin-hadoop2.7')

# misc
import datetime as dt
timestamp = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')

# init
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BANK_MODELO").getOrCreate()

# loading the model
from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load(
    'hdfs://elephant:8020/user/labdata/model/bank-pipeline-model-res/')
#pipelineModel = PipelineModel.load('model/bank-pipeline-model-res/')

# loading the data
#data = spark.read.csv(
#    'data/new-data.csv',
#    sep=';',
#    header=True,
#    inferSchema=True
#)
data = spark.read.csv("hdfs://elephant:8020/user/labdata/new-data.csv",
                      header=True, sep=";", inferSchema=True)

# replace dots in the column names so Spark SQL can reference them
data = data.selectExpr(
    *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns])
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


if __name__ == "__main__":
    # Spark Context
    sc = SparkContext("local[2]", appName="StreamingReviews")
    sc.setLogLevel("ERROR")
    # Update Stream every 10 seconds
    ssc = StreamingContext(sc, 10)
    # Load Model
    lr_model = PipelineModel.load('./Model')
    # Create DStream from data source
    lines = ssc.textFileStream('./Test')
    # Transformations and actions on DStream
    text = lines.map(lambda x: x[1:-1])

    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())
            # Remove Header
            head = rdd.first()
            rdd = rdd.filter(lambda x: x != head)
            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(text=w.encode('utf-8')))
            # Create new Data Frame
def infant_survival_ml():
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_ft.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(
        maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition.
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
    # spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
    spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn")
    return spark


if __name__ == '__main__':
    spark = prepare()

    # 1. load the data
    df_result = load_training_data(spark)
    df_validate = df_result  #.select("id", "label", "features").orderBy("id")

    # 2. load model
    model = PipelineModel.load("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/rf")

    # 3. compute accuracy on the test set
    predictions = model.transform(df_validate)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    print("Test set accuracy = " + str(accuracy))

    # 4. Test with Pharbers defined methods
    result = predictions
    # result.printSchema()
    result = result.withColumn("JACCARD_DISTANCE_MOLE_NAME", result.JACCARD_DISTANCE[0]) \
        .withColumn("JACCARD_DISTANCE_DOSAGE", result.JACCARD_DISTANCE[1]) \
        .drop("JACCARD_DISTANCE", "features", "indexedFeatures") \
        .drop("rawPrediction", "probability")
    # result.orderBy("id").repartition(1).write.mode("overwrite").csv("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/result")
# Run App #
################################################################################################
if __name__ == "__main__":
    from pyspark.sql import SparkSession
    from pyspark.ml import PipelineModel

    spark = SparkSession \
        .builder \
        .config("spark.driver.allowMultipleContexts", "true") \
        .appName("pyspark_nfl_app") \
        .getOrCreate()

    model_pass = PipelineModel.load('/assets/static/assets/nfl_model_pass')
    model_run = PipelineModel.load('/assets/static/assets/nfl_model_run')
    #model_pass = PipelineModel.load('./static/assets/nfl_model_pass')
    #model_run = PipelineModel.load('./static/assets/nfl_model_run')

    #app.run(debug=True, threaded=False, host='0.0.0.0', port=4444)
    app.run(threaded=False, host='0.0.0.0', port=4444)

'''
0   Date
1   GameID
2   Drive
3   qtr