def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())
        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(title=w[1]))
        wordsDataFrame = spark.createDataFrame(rowRdd)
        # load model pipeline and score the micro-batch
        model = PipelineModel.load('kmeans')
        prediction = model.transform(wordsDataFrame).select("6_kmeans")
        prediction.show(5)
    except:
        pass
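# For context, a minimal sketch of how a pipeline like the 'kmeans' model loaded above
# could have been fitted and saved beforehand. This is an assumption for illustration,
# not the original training code: the feature stages and the sample data are placeholders,
# only the column names "title" and "6_kmeans" are taken from the snippet above.
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Tokenizer, HashingTF

tokenizer = Tokenizer(inputCol="title", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=1000)
kmeans = KMeans(k=6, featuresCol="features", predictionCol="6_kmeans")

train_df = spark.createDataFrame([("some title text",), ("another title",)], ["title"])
kmeans_pipeline = Pipeline(stages=[tokenizer, hashingTF, kmeans])
kmeans_pipeline.fit(train_df).write().overwrite().save("kmeans")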
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))
        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    # Z-score normalize the play counts per user
    w = Window.partitionBy("user_id")

    def z_score(c, w):
        return (col(c) - mean(c).over(w)) / stddev(c).over(w)

    test_z = test.select("user_id", "track_id", "count", z_score("count", w).alias("count2"))
    test_z.createOrReplaceTempView('test_z')
    test = spark.sql('SELECT user_id, track_id, COALESCE(count2, count) AS count FROM test_z')
    test.createOrReplaceTempView('test')
    print('Test Z created')

    # Creating the train sample:
    # all validation and test users from train, plus 10% of the rest of the train set
    train_sample = spark.read.parquet('hdfs:/user/dev241/extension3_zscores.parquet')
    print("Training sample loaded")

    # Index user and track ids with the previously fitted StringIndexer pipeline
    string_indexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = string_indexer.transform(test)
    train_idx = string_indexer.transform(train_sample)

    # Best hyperparameters found during tuning
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    # Fit the implicit-feedback ALS model
    als = ALS(rank=rank, alpha=alpha, regParam=reg, userCol="user_idx",
              itemCol="track_idx", ratingCol="count",
              coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for Ext3 done")
    model.save("Extension3(z_score)")
    print("Model save for Ext3 done")

    # Test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: (
        [track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Ext 3 Test mean Average Precision : ", mavgp)
from flask import Flask
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel

app = Flask(__name__)

# get spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Sparkify") \
    .getOrCreate()

# load model
model = PipelineModel.load('../model/classifier')


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # get spark context
    sc = SparkContext.getOrCreate()

    # create spark dataframe to predict customer churn using the model
    # [gender, level, days_active, location, avgSongs, avgEvents, thumbsup, thumbsdown, add_friend]
    gender = ''
    level = 0
## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available,
## this call stores the pipeline to a local file in the current directory. If HDFS & Hadoop are
## available, it stores the pipeline in the HDFS home directory of the current user. Absolute
## paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction_output.p1").first()["p1"] > hamThreshold


print(isSpam("Michal, h2oworld party tonight in MV?", loaded_model))
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Boilerplate Spark stuff:
spark = SparkSession\
    .builder\
    .appName("diabetes_patient_readmission_onlinescore")\
    .getOrCreate()
sc = spark.sparkContext

# Load up the patient discharge file, use the trained model and predict readmissions
test = spark.table("diabetic_data_original_parquet")

modelPath = "hdfs:///tmp/diabetes/diabetic_data_model/"
from pyspark.ml import PipelineModel
sameModel = PipelineModel.load(modelPath)

predictions = sameModel.transform(test)
predictions.select('encounter_id', 'patient_nbr', 'prediction').filter('prediction = 1').show(50)
""" 验证模型 """ start=time.time() # 再次对测试集的数据进行词转向量的转化 test_set = model.transform(test_set) # 再次将多列数据转化为单列的向量列(决策树可以识别的类型) # test_set = assembler.transform(test_set) # 再次使用cv_pipelineModel进行验证,把在pipeline中的所有transform都执行一遍(???) bestDt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="entropy", maxDepth=bestModel.depth, maxBins=32) dt_pipeline = Pipeline(stages=[assembler, bestDt]) # (???) dt_model = dt_pipeline.fit(manbing) dt_model.write().overwrite().save("./models/courage_dtmodel/") sameDTModel = PipelineModel.load("./models/courage_dtmodel/") predictions = sameDTModel.transform(test_set) # (???) # 使用评估器对预测结果进行评估得到auc auc = evaluator.evaluate(predictions) # (???) print("auc="+str(auc)) acc = predictions.filter(predictions['label'] == predictions['prediction']).count() / float(predictions.count()) print("acc="+str(acc)) end = time.time() print("预测用时:{}".format(end-start)) """ auc=0.9834579598810581 acc=0.9598010774968918 """
from pyspark.ml import PipelineModel

newsModel = PipelineModel.load("hdfs://localhost:19000/user/Waseem/bestPipeline")
sportsModel = PipelineModel.load("hdfs://localhost:19000/user/Waseem/sportsPipeline")
model = [newsModel, sportsModel]
sentenceDataFrame = spark.createDataFrame([(0, 1, 2), (0, 1, 2), (1, 1, 2)], ["label", "a", 'b'])
from pyspark.sql import functions
df = sentenceDataFrame.withColumn('c', functions.lit(np.nan))
df.show()

#############
# Test the pipeline
#############
from pyspark.ml import Pipeline, PipelineModel, Transformer

blankTransformer = BlankTransformer(inputCols=["a", "b", "c"], outputCols=["a_1", "b_1", "c_1"])
p = Pipeline(stages=[blankTransformer])
# df = spark.sparkContext.parallelize([(1, None), (2, 1.0), (3, 0.5)]).toDF(["key", "value"])
pm = p.fit(df)
pm.transform(df).show()

###########################
# Test saving the PipelineModel, then loading and checking it
############################
pm.write().overwrite().save('./test/test.model')
pm2 = PipelineModel.load('./test/test.model')
print('matches?', pm2.stages[0].extractParamMap() == pm.stages[0].extractParamMap())
print(pm2.stages[0].extractParamMap())
pm2.transform(df).show()
def transform(df, model_path, prediction_column):
    pipeline_model = PipelineModel.load(model_path)
    predictions = pipeline_model.transform(df)
    predictions = predictions.drop(*["features", "rawPrediction", "probability",
                                     "categorical_features", "continuous_features",
                                     "continuous_vector"])
    predictions = predictions.withColumnRenamed("prediction", prediction_column)
    return predictions
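# A small usage sketch for the helper above. The input data, model path, and column name
# are assumptions for illustration, not values from the original project.
scored_df = transform(spark.read.parquet("data/customers.parquet"),
                      model_path="models/churn_pipeline",
                      prediction_column="churn_prediction")
scored_df.show(5)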
from pyspark.ml import PipelineModel
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, FloatType

pipelinePath = "./LDA-pipeline-model"
pipeline_model = PipelineModel.load(pipelinePath)

# 5. check the topic distribution among the dataset
df_with_topics = pipeline_model.transform(df).select("tweet_text", "topicDistribution")

to_array = F.udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
df_with_topics_toArray = df_with_topics.select(
    "tweet_text", to_array("topicDistribution").alias("topicDistributionArray"))
df_with_topics_final = df_with_topics_toArray.select(
    ["tweet_text"] + [(F.col("topicDistributionArray")[i]).alias("topic_" + str(i))
                      for i in range(10)])
df_with_topics_final.agg(
    *[F.sum(F.col("topic_" + str(i))) for i in range(n_topics)])
# Keep the major version from lmp_version
dataset = dataset.withColumn('lmp_version_split', F.split(F.col('lmp_version'), "-").getItem(0))

# Split nap, uap and lap out of the address column
dataset = dataset.withColumn('nap', dataset.address.substr(1, 5))
dataset = dataset.withColumn('uap', dataset.uap_lap.substr(1, 2))
dataset = dataset.withColumn('lap', dataset.uap_lap.substr(4, 11))

# StringIndexer
string_indexer_model_path = "{}/data/stringIndexerModel.bin".format(base_path)
string_indexer = PipelineModel.load(string_indexer_model_path)
dataset = string_indexer.transform(dataset)

# MinMaxScaler
minMaxScaler_model_path = "{}/data/minMaxScalerModel.bin".format(base_path)
minMaxScaler = PipelineModel.load(minMaxScaler_model_path)
dataset = minMaxScaler.transform(dataset)

# OneHotEncoding
ohe_model_path = "{}/data/oheModel.bin".format(base_path)
ohe = PipelineModel.load(ohe_model_path)
dataset = ohe.transform(dataset)

# VectorAssembler
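# For context, a minimal sketch (assumed, not the original training code) of how a
# preprocessing PipelineModel such as stringIndexerModel.bin could have been fitted and
# persisted beforehand; the indexed columns are placeholders.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
            for c in ["nap", "uap", "lap"]]
indexer_model = Pipeline(stages=indexers).fit(dataset)
indexer_model.write().overwrite().save("{}/data/stringIndexerModel.bin".format(base_path))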
import subprocess
from kafka import KafkaConsumer
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
indexName = "twitternb"
indexName2 = "twitterlr"
typeName1 = "NaiveB"
typeName2 = "logisticR"

sc = SparkContext()
sqlContext = SQLContext(sc)
consumer = KafkaConsumer('twitter', group_id='my-group',
                         bootstrap_servers=['localhost:9092'])
nbModel = PipelineModel.load("APJ180001_nb.model")
lrModel = PipelineModel.load("APJ180001_lr.model")
evaluator = MulticlassClassificationEvaluator()

count = 0
sum = 0
avg = 0
sum2 = 0
count2 = 0
labels = {}
index = 0
accidentalTweetsNB = open("accidentalTweetsNB.txt", 'a+')
accidentalTweetsLR = open("accidentalTweetsLR.txt", 'a+')

for message in consumer:
def construct_component_from_pipe_identifier(
        language, nlp_ref, nlu_ref, path=None, is_licensed=False):
    # -> NLUPipeline
    """
    Creates a list of components from a Spark NLP Pipeline reference
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param is_licensed: Whether the pipe is licensed or not
    :param nlu_ref: NLU ref that points to this pipe
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param path: Load component_list from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    """
    if 'language' in nlp_ref:
        # special edge case for lang detectors
        language = 'xx'
    if path is None:
        if is_licensed:
            pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
        else:
            pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []

    os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict()
    hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()
    ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()
    for jsl_anno_object in iterable_stages:
        anno_class_name = type(jsl_anno_object).__name__
        logger.info(
            f"Extracting model from Spark NLP pipeline: obj= {jsl_anno_object} class_name = {anno_class_name} and creating Component"
        )
        if anno_class_name in os_annos.keys():
            jsl_anno_id = os_annos[anno_class_name]
            nlu_component = ComponentMap.os_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.open_source)
            constructed_components.append(nlu_component)
        elif anno_class_name in hc_annos.keys():
            # Licensed HC
            jsl_anno_id = hc_annos[anno_class_name]
            nlu_component = ComponentMap.hc_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.hc)
            constructed_components.append(nlu_component)
        elif anno_class_name in ocr_annos:
            # Licensed OCR (WIP)
            jsl_anno_id = ocr_annos[anno_class_name]
            nlu_component = ComponentMap.ocr_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True,
                                       Licenses.ocr)
            constructed_components.append(nlu_component)
        else:
            raise ValueError(
                f'Could not find matching nlu component for annotator class = {anno_class_name}'
            )
    if None in constructed_components or len(constructed_components) == 0:
        raise Exception(
            f"Failure inferring type anno_class={anno_class_name} ")
    return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
        PipeUtils.set_column_values_on_components_from_pretrained_pipe(
            constructed_components, nlp_ref, language, path))
run_mode = 'batch'  # 'batch' or 'streaming'
outputs = 'outputs.json'

#################################################

if len(sys.argv) != 2:
    print("Usage: <models folder> ")
    exit(1)
models_folder = sys.argv[1]

spark = SparkSession.builder.master('local[{}]'.format(spark_threads)).appName(
    'local-testing-pyspark-context').getOrCreate()

# Load the model
print("Loading model...")
model = PipelineModel.load(models_folder + '/' + ml_model_name)
print("Done.")

#with open(models_folder + '/' + input_dtypes, 'r') as f:
#    types = {x: np.dtype(y) for x, y in json.load(f).items()}


def read_data():
    print("Reading data")
    df = pd.read_json(stage2_outputs)
    if sample_size:
        df = df.sample(sample_size)
    print("Size of test data is : ", len(df))
    return df
        header=True,
        maxFilesPerTrigger=1)\
    .load()

# If the numeric LotFrontage field contains null, the whole row is written as zeros,
# so we declare the column as string in the schema, then cast it to Float and fill the null values.
# Visually there is no Id with a null value; where they come from later is unclear.
test = test_start\
    .withColumn("LotFrontage", F.expr("CAST(LotFrontage as FLOAT)"))\
    .na.fill({"LotFrontage": 60.0, "Id": 0})

out = console_output(test.select("Id", "LotFrontage", "LotArea"), 100)
out.stop()

# loading the pipeline takes quite a while
pipeline_model = PipelineModel.load("my_GB_model8_ob")

"""
/cassandra/bin/cqlsh 10.0.0.18 — start the shell
# create the schema
# CREATE KEYSPACE lesson8
#   WITH REPLICATION = {
#     'class' : 'SimpleStrategy', 'replication_factor' : 1 };

use lesson8;
DROP TABLE houses_price_prediction;
CREATE TABLE IF NOT EXISTS houses_price_prediction (Id int primary key, SalePrice int);
"""
def main(sentiment_input, user_input, review_input, model_input, output_folder):
    # read input files
    df_sentiment = spark.read.csv(sentiment_input, header=True)
    df_user = spark.read.parquet(user_input)
    df_review = spark.read.parquet(review_input)

    # get 50 users
    df_50_users = df_user.limit(50)

    # cross join user and business
    df_usr_bus_all = df_50_users \
        .crossJoin(df_sentiment) \
        .where(df_sentiment['ZipCode'].isNull() == False) \
        .select(
            df_sentiment['BusinessID'],
            df_user['UserID'],
            df_user['UserName'],
            df_user['ReviewCount'].alias('UserReviewCount'),
            df_user['AverageStars'].alias('UserAverageStars'),
            functions.lit(0).alias('ReviewStars'),
            functions.dayofyear(functions.current_date()).alias('ReviewDayOfYear'),
            df_sentiment['Name'].alias('BusinessName'),
            df_sentiment['ZipCode'].alias('BusinessPostalCode'),
            df_sentiment['ZipCode'].substr(1, 3).alias('BusinessNeighborhood'),
            df_sentiment['Latitude'].cast(types.FloatType()),
            df_sentiment['Longitude'].cast(types.FloatType()),
            df_sentiment['avg_neg'].cast(types.FloatType()).alias('AverageNegative'),
            df_sentiment['avg_neu'].cast(types.FloatType()).alias('AverageNeutral'),
            df_sentiment['avg_pos'].cast(types.FloatType()).alias('AveragePositive'),
            df_sentiment['avg_composite_score'].cast(types.FloatType()).alias('AverageComposite'))

    # left join with reviews
    df_joined = df_usr_bus_all.join(df_review, ['BusinessID', 'UserID'], 'left_outer') \
        .select(df_review['ReviewID'],
                df_usr_bus_all['BusinessID'],
                df_usr_bus_all['UserID'],
                df_usr_bus_all['UserName'],
                df_usr_bus_all['UserReviewCount'],
                df_usr_bus_all['UserAverageStars'],
                df_usr_bus_all['ReviewStars'],
                df_usr_bus_all['ReviewDayOfYear'],
                df_usr_bus_all['BusinessName'],
                df_usr_bus_all['BusinessPostalCode'],
                df_usr_bus_all['BusinessNeighborhood'],
                df_usr_bus_all['Latitude'],
                df_usr_bus_all['Longitude'],
                df_usr_bus_all['AverageNegative'],
                df_usr_bus_all['AverageNeutral'],
                df_usr_bus_all['AveragePositive'],
                df_usr_bus_all['AverageComposite'])

    # get restaurants that the user has not visited
    df_not_visited_rests = df_joined.where(df_joined['ReviewID'].isNull())

    # load the model
    loaded_model = PipelineModel.load(model_input)

    # use the model to make predictions
    predictions = loaded_model.transform(df_not_visited_rests)
    predictions_init = predictions.select(predictions['BusinessID'],
                                          predictions['BusinessName'],
                                          predictions['BusinessPostalCode'],
                                          predictions['BusinessNeighborhood'],
                                          predictions['UserID'],
                                          predictions['UserName'],
                                          predictions['UserReviewCount'],
                                          predictions['UserAverageStars'],
                                          predictions['ReviewDayOfYear'],
                                          predictions['prediction'].alias('PredictedReviewStar'),
                                          predictions['Latitude'],
                                          predictions['Longitude'],
                                          predictions['AverageNegative'],
                                          predictions['AverageNeutral'],
                                          predictions['AveragePositive'],
                                          predictions['AverageComposite'])

    # clip scores: > 5 becomes 5 and < 0 becomes 0
    predictions_final = predictions_init.withColumn(
        'FinalStar',
        functions.when(predictions_init["PredictedReviewStar"] >= 5, 5)
        .otherwise(functions.when(predictions_init["PredictedReviewStar"] <= 0, 0)
                   .otherwise(predictions_init['PredictedReviewStar'])))

    # partition by user
    window = Window.partitionBy(predictions_final['UserID']).orderBy(
        predictions_final['FinalStar'].desc())

    # get the top 10 scores for each user based on the partition
    prediction_to_save = predictions_final.select(
        '*', functions.row_number().over(window).alias('rank')).filter(col('rank') <= 10)

    # save predictions to output
    prediction_to_save.coalesce(1).write.csv(output_folder + '/TestModel', header=True)
from pyspark import SparkContext
from pyspark.ml import PipelineModel
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from nltk.corpus import stopwords

if __name__ == '__main__':
    # create a spark context
    sc = SparkContext.getOrCreate()
    # create a sql spark context
    sql = SQLContext(sc)

    # load the trained pipeline
    pipeline = PipelineModel.load('hdfs:///model_lr')
    file_path = '/data/twitter_data/z_sample.csv'

    # defining a schema for the data
    schema = StructType([
        StructField('polarity', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('date', StringType(), True),
        StructField('query', StringType(), True),
        StructField('user', StringType(), True),
        StructField('text', StringType(), True)
    ])
    useless_columns = ['id', 'date', 'query', 'user']
def main(file1, file2, input_model, u_id, sim_bus_limit=3):
    data = spark.read.parquet(file1)
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)

    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text',
                                    outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)

    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token', outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol='nonstopwrd', outputCol='raw_features', minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="idf_vec")
    word2Vec = Word2Vec(vectorSize=500, minCount=5, inputCol='nonstopwrd',
                        outputCol='word_vec', seed=123)
    #vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')

    pipeline = Pipeline(stages=[
        regexTokenizer, stopWordsRemover, countVectorizer, idf, word2Vec
    ])
    #pipeline_model = pipeline.fit(review_df)
    #pipeline_model.write().overwrite().save('content_userid')

    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    usr_rev_bus = spark.sql(
        'SELECT distinct business_id FROM review where stars >= 3.0 and user_id = "{}"'
        .format(u_id))
    bus_list = [i for i in usr_rev_bus.collect()]

    for b_id in bus_list:
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id[0]][0]
        similar_business_rdd = sc.parallelize(
            (i[0], float(CosineSim(input_vec, i[1]))) for i in all_business_vecs)
        similar_business_df = spark.createDataFrame(
            similar_business_rdd).withColumnRenamed(
                '_1', 'business_id').withColumnRenamed('_2', 'score').orderBy(
                    "score", ascending=False)
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id[0]).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id[0]))
        # get restaurants similar to the user_id
        result = similar_businesses_df.union(similar_business_df)
    result.cache()

    # filter out those that have been reviewed before by the user
    d = [i[0] for i in usr_rev_bus.collect()]
    df_1 = result.filter(~(col('business_id').isin(d))).select('business_id', 'score')
    #df_1 = result.join(usr_rev_bus, 'business_id', 'left_outer').where(col("usr_rev_bus.business_id").isNull()).select([col('result.business_id'), col('result.score')])
    df_2 = df_1.orderBy("score", ascending=False).limit(sim_bus_limit)
    df_result = df_business.join(df_2, 'business_id', 'right').select(
        'business_id', 'score', 'name', 'categories', 'latitude', 'longitude')
    df_result.show()
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml import PipelineModel, Pipeline
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField, SelectField, FloatField
from wtforms.widgets import html5

app = Flask(__name__)

# Initiate SparkSession
spark = SparkSession.builder \
    .master('local') \
    .appName('sparkify') \
    .getOrCreate()

# load model
model_gbt = PipelineModel.load("../model/sparkify_model")

# Load dataframe
df_ML = spark.read.parquet("../model/sparkify.parquet")
df_pd = df_ML.toPandas()
location_list = df_pd.location_first.unique().tolist()
location_list = [(location, location) for location in location_list]

# Graph 1
graph1 = df_pd.groupby('gender')['churn'].value_counts(normalize=True).unstack()
g1_name1 = str(graph1.columns[0])
g1_name2 = str(graph1.columns[1])
g1_x = graph1.index
g1_y1 = graph1.values.T[0]
g1_y2 = graph1.values.T[1]

# Graph 2
import numpy as np
from flask import Flask, request, abort
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StringIndexerModel

app = Flask(__name__)
sc = SparkContext('local')
sqlContext = SQLContext(sc)

list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

lr_model = PipelineModel.load('/spark_models/lr_model')
bc_model = PipelineModel.load('/spark_models/bc_model')
mc_model = PipelineModel.load('/spark_models/mc_model')

# https://stackoverflow.com/questions/45885044/getting-labels-from-stringindexer-stages-within-pipeline-in-spark-pyspark
mc_classes = {
    x._java_obj.getOutputCol(): x.labels
    for x in mc_model.stages if isinstance(x, StringIndexerModel)
}
mc_classes = mc_classes['label']


@app.route('/lr')
def linear_regression():
    try:
        df = sqlContext.createDataFrame(
def load_lda_model(spark):
    register_remove_punctuation_udf(spark)
    ldaPipelineModel = PipelineModel.load(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")
    #ldaPipelineModel.stages[0] = SQLTransformer(statement="SELECT jokeID, clean_text_udf(raw_text) text FROM __THIS__")
    return ldaPipelineModel
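# A small usage sketch (assumed, not part of the original source): apply the loaded LDA
# pipeline to a dataframe of raw joke text. "jokes_df" is a placeholder, and the output
# column name assumes Spark's default topicDistributionCol.
lda_model = load_lda_model(spark)
jokes_with_topics = lda_model.transform(jokes_df)
jokes_with_topics.select("jokeID", "topicDistribution").show(5, truncate=False)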
def load_model(building_id, meter):
    model_path = "output/als_model_{0}_{1}".format(building_id, meter)
    return PipelineModel.load(model_path)
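# A small usage sketch (assumed, not from the original source): load the per-building,
# per-meter model and score a dataframe of readings. The ids and "readings_df" are
# placeholders for illustration.
model = load_model(building_id=1099, meter=0)
predictions = model.transform(readings_df)
predictions.select("prediction").show(5)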
def cargar_juez(path, tipo, mongo_uri=None):
    # Load a saved judge model; optionally persist its training set to MongoDB first
    if tipo == 1 and mongo_uri:
        df = spark_session().read.json(path + "_trainingset")
        df.rdd.map(lambda t: t.asDict()).saveToMongoDB(mongo_uri)
    return PipelineModel.load(path)
    lambda x: datetime.strftime(
        datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S'
    )
)
df = df.withColumn("created_at", date_process(df.created_at))

################# Pre-processing the data
pre_process = udf(
    lambda x: re.sub(r'[^A-Za-z\n ]|(http\S+)|(www.\S+)', '',
                     x.lower().strip()).split(),
    ArrayType(StringType())
)
df = df.withColumn("cleaned_data", pre_process(df.message)).dropna()

################# Passing into ml pipeline
model_path = SRC_DIR.joinpath('models')
pipeline_model = PipelineModel.load(str(model_path))
prediction = pipeline_model.transform(df)

'''
The labels are labelled with
    positive (4) as 0.0
    negative (0) as 1.0
'''
prediction = prediction \
    .select(prediction.cleaned_data, prediction.created_at,
            prediction.timestamp, prediction.message, prediction.prediction)
# print(prediction.schema)

################# Write to Delta
from flask import Flask, jsonify, render_template, request
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import json

MASTER = 'local'
APPNAME = 'simple-ml-serving'
MODEL_PATH = 'file:///home/cdsw/cdsw-simple-serving-python/model/spark-model'

spark = SparkSession.builder.master(MASTER).appName(APPNAME).getOrCreate()
model = PipelineModel.load(MODEL_PATH)


def classify(input):
    #target_columns = input.columns + ["prediction"]
    target_columns = ["prediction"]
    return model.transform(input).select(target_columns).collect()


# webapp
app = Flask(__name__)


@app.route('/api/predict', methods=['POST'])
def predict():
    input_df = spark.sparkContext.parallelize([request.json]).toDF()
    output = classify(input_df)
    return jsonify(input=request.json, prediction=output)


@app.route('/')
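# A hypothetical client call for the /api/predict route above (not from the original
# source). The JSON field names are placeholders for whatever schema the saved pipeline
# expects, and the URL assumes the app runs on Flask's default local port.
import requests

payload = {"feature1": 1.0, "feature2": "some value"}
resp = requests.post("http://localhost:5000/api/predict", json=payload)
print(resp.json())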
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import PipelineModel
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder \
    .appName("Telco Customer Churn") \
    .master("local[*]") \
    .getOrCreate()

model = PipelineModel.load("models/spark/mlp")

features = ["intl_plan", "account_length", "number_vmail_messages", "total_day_calls",
            "total_day_charge", "total_eve_calls", "total_eve_charge",
            "total_night_calls", "total_night_charge", "total_intl_calls",
            "total_intl_charge", "number_customer_service_calls"]


def predict(args):
    account = args["feature"].split(",")
    feature = spark.createDataFrame([account[:1] + list(map(float, account[1:12]))], features)
    result = model.transform(feature).collect()[0].prediction
    return {"result": result}


#features = ["intl_plan_indexed", "account_length", "number_vmail_messages", "total_day_calls",
#            "total_day_charge", "total_eve_calls", "total_eve_charge",
#            "total_night_calls", "total_night_charge", "total_intl_calls",
#            "total_intl_charge", "number_customer_service_calls"]

predict({
    "feature": "no, 128, 25, 256, 110, 197.4, 50, 244.7, 91, 10, 5, 1"
})
print(str(edate))

# paragraph 3 - send start email
send_mail(
    "Email Cadence Model Scoring Starting",
    "*****@*****.**",
    "Email Cadence Model Scoring: " + str(today),
)

# paragraph 4 - loading model from file
try:
    # load model from exported file
    lrModel = PipelineModel.load(
        "/user/datascience/test_db.db/features/lrModel")
    print("Model Loaded")
except Exception as e:
    print(str(e))
    errormsg = errormsg + "Model Failed Loading\n"

# paragraph 5 - data config
### SCORING
print("Scoring")
try:
    ## change date as parameter to take in current date (or date of score)
    scoring_query = "select * from test.features where ds = '" + str(edate) + "'"
## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available,
## this call stores the pipeline to a local file in the current directory. If HDFS & Hadoop are
## available, it stores the pipeline in the HDFS home directory of the current user. Absolute
## paths can be used as well. The same holds for the model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model):
    smsTextDF = spark.createDataFrame([(smsText, )], ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)
    return prediction.select("prediction").first()["prediction"] == "spam"


isSpamMsg = isSpam("Michal, h2oworld party tonight in MV?", loaded_model)
assert not isSpamMsg
# finding spark
#import findspark
#findspark.init('/home/pfcor/spark-2.1.0-bin-hadoop2.7')

# misc
import datetime as dt
timestamp = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')

# init
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BANK_MODELO").getOrCreate()

# loading the model
from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load(
    'hdfs://elephant:8020/user/labdata/model/bank-pipeline-model-res/')
#pipelineModel = PipelineModel.load('model/bank-pipeline-model-res/')

# loading the data
#data = spark.read.csv(
#    'data/new-data.csv',
#    sep=';',
#    header=True,
#    inferSchema=True
#)
data = spark.read.csv("hdfs://elephant:8020/user/labdata/new-data.csv",
                      header=True, sep=";", inferSchema=True)

# replace dots in the column names so Spark SQL can reference them
data = data.selectExpr(
    *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns])
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


if __name__ == "__main__":
    # Spark Context
    sc = SparkContext("local[2]", appName="StreamingReviews")
    sc.setLogLevel("ERROR")
    # Update Stream every 10 seconds
    ssc = StreamingContext(sc, 10)
    # Load Model
    lr_model = PipelineModel.load('./Model')
    # Create DStream from data source
    lines = ssc.textFileStream('./Test')
    # Transformations and actions on DStream
    text = lines.map(lambda x: x[1:-1])

    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())
            # Remove Header
            head = rdd.first()
            rdd = rdd.filter(lambda x: x != head)
            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(text=w.encode('utf-8')))
            # Create new Data Frame
def infant_survival_ml():
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_ft.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(
        maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition.
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
    # spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
    spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.cn-northwest-1.amazonaws.com.cn")
    return spark


if __name__ == '__main__':
    spark = prepare()

    # 1. load the data
    df_result = load_training_data(spark)
    df_validate = df_result  #.select("id", "label", "features").orderBy("id")

    # 2. load model
    model = PipelineModel.load("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/rf")

    # 3. compute accuracy on the test set
    predictions = model.transform(df_validate)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    print("Test set accuracy = " + str(accuracy))

    # 4. Test with Pharbers defined methods
    result = predictions
    # result.printSchema()
    result = result.withColumn("JACCARD_DISTANCE_MOLE_NAME", result.JACCARD_DISTANCE[0]) \
        .withColumn("JACCARD_DISTANCE_DOSAGE", result.JACCARD_DISTANCE[1]) \
        .drop("JACCARD_DISTANCE", "features", "indexedFeatures") \
        .drop("rawPrediction", "probability")
    # result.orderBy("id").repartition(1).write.mode("overwrite").csv("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/result")
# Run App #
################################################################################################
if __name__ == "__main__":
    from pyspark.sql import SparkSession
    from pyspark.ml import PipelineModel

    spark = SparkSession \
        .builder \
        .config("spark.driver.allowMultipleContexts", "true") \
        .appName("pyspark_nfl_app") \
        .getOrCreate()

    model_pass = PipelineModel.load('/assets/static/assets/nfl_model_pass')
    model_run = PipelineModel.load('/assets/static/assets/nfl_model_run')
    #model_pass = PipelineModel.load('./static/assets/nfl_model_pass')
    #model_run = PipelineModel.load('./static/assets/nfl_model_run')

    #app.run(debug=True, threaded=False, host='0.0.0.0', port=4444)
    app.run(threaded=False, host='0.0.0.0', port=4444)

'''
0   Date
1   GameID
2   Drive
3   qtr