Example #1
 def updateResources(self, duration_model_path, crowdedness_model_path,
                     pipeline_path, routes_stops_path):
     self.duration_model = LinearRegressionModel.load(duration_model_path)
     self.crowdedness_model = LinearRegressionModel.load(
         crowdedness_model_path)
     self.pipeline = PipelineModel.load(pipeline_path)
     self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
         self.sc, self.sqlContext, routes_stops_path)
Example #2
 def __init__(self, app_name, duration_model_path, crowdedness_model_path,
              pipeline_path, routes_stops_path):
     self.sc = SparkContext(conf=SparkConf().setAppName(app_name))
     self.sqlContext = SQLContext(self.sc)
     self.duration_model = LinearRegressionModel.load(duration_model_path)
     self.crowdedness_model = LinearRegressionModel.load(
         crowdedness_model_path)
     self.pipeline = PipelineModel.load(pipeline_path)
     self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
         self.sc, self.sqlContext, routes_stops_path)
Example #3
def predict(bucket_name, feature_path, feature_name, output_path, plot_path):    
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)

    model_path = path.join(output_path, "regression-model")
    print "Load model from:", model_path
    lrModel = LinearRegressionModel.load(model_path)

    # read last maintenance time from json
    maintain4 = 0.0
    maintain12 = 0.0
    with open(path.join(output_path, "last_maintain.json")) as f:
        last_maintain = json.load(f)
        maintain4 = last_maintain['maintain4']
        maintain12 = last_maintain['maintain12']

    # read data from s3 for prediction
    df = read_data(bucket_name, feature_path, feature_name)
    # transform predict data
    df = df.withColumn('maintain4', lit(maintain4))
    df = df.withColumn('maintain12', lit(maintain12))
    test = df.rdd.map(
        lambda x: (
            Vectors.dense([x.amount, x.split, x.maintain4, x.maintain12]),
        )
    ).toDF(["features"])

    lrModel.transform(test).toPandas().to_csv(
        path_or_buf=path.join(output_path, "pred-" + feature_name))
Example #4
class CustomModel:

    model = LinearRegressionModel()

    def __init__(self, pmodel):
        self.model = pmodel
        #self.model = LinearRegressionModel(pmodel)

    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state['model']
        return state
        #return self.__dict__

    def __setstate__(self, state):
        # Restore instance attributes (i.e., filename and lineno).
        self.__dict__.update(state)

    def getModel(self):
        return self.model
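
The __getstate__/__setstate__ pair above exists so the wrapper can be pickled even though the underlying Spark model cannot. A minimal sketch of how that round trip might look, assuming an active SparkSession and a saved model at the hypothetical path /tmp/lr_model:

import pickle
from pyspark.ml.regression import LinearRegressionModel

fitted_model = LinearRegressionModel.load("/tmp/lr_model")  # hypothetical saved model
wrapper = CustomModel(fitted_model)

# __getstate__ drops the unpicklable Spark model before serialization
payload = pickle.dumps(wrapper)

# After unpickling, 'model' is absent and has to be re-attached by hand
restored = pickle.loads(payload)
restored.model = LinearRegressionModel.load("/tmp/lr_model")
print(restored.getModel())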
Example #5
def main(bootstrap_server, topic_name, time_interval, scikit_model_path,
         spark_model_path, output_attribute_index):

    # Load the Passive Aggressive Regressor model
    with hdfs.open(scikit_model_path, 'r') as opened_file:
        regressor = pickle.load(opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Consumer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required so that the model can be loaded)
    session = SparkSession(context)

    # Load the Spark model from the given path
    model = LinearRegressionModel.load(spark_model_path)

    # Instantiate the streaming context
    # (so that processing runs every time_interval seconds),
    # as well as the stream for the given topic and Kafka broker
    streaming_context = StreamingContext(context, time_interval)
    stream = KafkaUtils.createDirectStream(
        streaming_context, [topic_name],
        {"metadata.broker.list": bootstrap_server})

    # For each RDD, define the function that performs the prediction
    # (with the corresponding model and output attribute index passed in)
    stream.foreachRDD(lambda input_data: prediction(
        input_data, regressor, model, output_attribute_index))

    # Start processing the stream
    streaming_context.start()
    streaming_context.awaitTermination()
Example #6
 def getOrCreateLR(self):
     try:
         if self.lrModel is None:
             self.lrModel = LinearRegressionModel.load(CONST_LR_FILE)
     except:
         print("Creating LR Model")
         self.lrModel = self.createLR()

     return self.lrModel
Example #7
def loadModels(path,typeofmodel):
  models = {}
  for park in park_data_with_date_dict:
    if typeofmodel == "linear":
      models[park] = LinearRegressionModel.load(path+str(park))
    elif typeofmodel == "tree":
      models[park] = DecisionTreeRegressionModel.load(path+str(park))
    elif typeofmodel == "gbt":
      models[park] = GBTRegressionModel.load(path+str(park))
  return models
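
A minimal usage sketch for the loader above; the path prefix and the idea that models were saved as <path><park_id> during training are assumptions:

# Hypothetical usage: one saved model per parking lot, keyed by park id
linear_models = loadModels("/models/parking/lr_", "linear")
for park_id, park_model in linear_models.items():
    print(park_id, park_model.numFeatures)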
Example #8
 def load(self, load_dir):
     if os.path.isdir(load_dir):
         if self.pm == 'PM10':
             self.model = LinearRegressionModel.load(
                 os.path.join(load_dir, 'model'))
         else:
             self.model = RandomForestRegressionModel.load(
                 os.path.join(load_dir, 'model'))
         self.imputer = ImputerModel.load(os.path.join(load_dir, 'imputer'))
         self.assembler = VectorAssembler.load(
             os.path.join(load_dir, 'assembler'))
     else:
         raise RuntimeError(
             'Save path: {}, does not exist or is not a directory'.format(
                 load_dir))
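
A sketch of the save counterpart this loader appears to assume (a directory containing 'model', 'imputer' and 'assembler' subfolders); the method itself is hypothetical and not part of the original class:

 def save(self, save_dir):
     # Hypothetical inverse of load(): writes the three pieces the loader expects
     os.makedirs(save_dir, exist_ok=True)
     self.model.write().overwrite().save(os.path.join(save_dir, 'model'))
     self.imputer.write().overwrite().save(os.path.join(save_dir, 'imputer'))
     self.assembler.write().overwrite().save(os.path.join(save_dir, 'assembler'))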
Example #9
    def process(self, data_input, data_output, model):
        """
        A Spark process that runs inference
        :param data_input: data input filename
        :param data_output: data output filename
        :param model: path of the saved Linear Regression model
        """

        # Load Linear Regression model
        lr_model = LinearRegressionModel.load(model)

        new_data = self.spark.read.parquet(data_input)

        # evaluate() returns a summary; its .predictions DataFrame holds the scored rows
        new_predictions = lr_model.evaluate(new_data).predictions

        # Save result as parquet
        new_predictions.write.format("parquet").mode('overwrite').option(
            "header", "true").save(data_output)
Example #10
def predict():
    """
    https://app.host/predict?value=0
    """
    value = int(request.args.get("value"))
    spark_session, _ = create_spark_connection()
    model_load = LinearRegressionModel.load(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "model"))
    predict_df = spark_session.createDataFrame([(1, Vectors.dense(value))],
                                               ["index", "features"])

    predict_collected = model_load.transform(predict_df).collect()[0]

    features = predict_collected.features.values.tolist()
    prediction = predict_collected.prediction
    output = {"features": features, "prediction": prediction}
    return jsonify(output)
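
A minimal sketch of calling the route above, assuming the surrounding Flask app is running locally on port 5000 (host and port are assumptions; the docstring's app.host is only a placeholder):

import requests

resp = requests.get("http://localhost:5000/predict", params={"value": 3})
print(resp.json())  # e.g. {"features": [3.0], "prediction": ...}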
Example #11
def find_rating(player_id, cur_date):
    sp_sess = SparkSession.builder.appName('Regr_Data').getOrCreate()
    play_path = "hdfs://localhost:9000/players.csv"
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    assembler = VectorAssembler(inputCols = ['new_diff'],outputCol = 'features')
    name_df = players.filter(players['Id'] == int(player_id))
    
    player_date = name_df.select("birthDate").collect()[0].birthDate

    new_date1 = player_date.split('-')
    new_date2 = cur_date.split('-')


    d1 = datetime.date(int(new_date1[0]), int(new_date1[1]), int(new_date1[2]))
    d2 = datetime.date(int(new_date2[0]), int(new_date2[1]), int(new_date2[2]))


    diff = abs(d2 - d1).days
    my_rating = 1.000
    my_schema = StructType([
        StructField('diff', IntegerType(), True),
        StructField('rating', FloatType(), True)
    ])
    my_dict = {'diff': diff, 'rating': my_rating}

    new_df = sp_sess.createDataFrame([my_dict], my_schema)

    new_df = new_df.withColumn('new_diff', new_df['diff'] / 1000)
    new_df = new_df.withColumn('new_rating', new_df['rating'] * 10)

    test = assembler.transform(new_df)
    final_model = LinearRegressionModel.load('reg_model')

    res = final_model.evaluate(test)

    req = res.predictions.select("prediction").rdd.flatMap(lambda x : x).collect()

    final_res = req[0] / 10

    if (final_res > 1):
        final_res /= 2
    if (final_res > 0.9):
        final_res /= 2

    return abs(final_res)
Example #12
def linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                (0.0, 2.0, Vectors.sparse(1, [], []))],
                               ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5,
                          regParam=0.0,
                          solver="normal",
                          weightCol="weight")
    model = lr.fit(df)
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    assert abs(model.transform(test0).head().prediction - (-1.0)) < 0.001
    assert abs(model.coefficients[0] - 1.0) < 0.001
    assert abs(model.intercept - 0.0) < 0.001
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    assert abs(model.transform(test1).head().prediction - 1.0) < 0.001
    # lr.setParams("vector") would raise:
    # TypeError: Method setParams forces keyword arguments.
    temp_path = "./"
    lr_path = temp_path + "/lr"
    lr.save(lr_path)
    lr2 = LinearRegression.load(lr_path)
    assert lr2.getMaxIter() == 5
    model_path = temp_path + "/lr_model"
    model.save(model_path)
    model2 = LinearRegressionModel.load(model_path)
    assert model.coefficients[0] == model2.coefficients[0]
    assert model.intercept == model2.intercept
    print(model.numFeatures)
Example #13
def main():
    
    feature_model = VectorIndexerModel.load(featureIndexer_path)
    vectorAssembler = VectorAssembler.load(vectorAssembler_path)
    ohe_model = OneHotEncoderModel.load(ohe_model_path)
    stringIndexer_model = StringIndexerModel.load(stringIndexerPath)
    lr_model = LinearRegressionModel.load(model_path)
    
    spark = SparkSession.builder.master("local").appName("Connection").getOrCreate()
    
    json_data = request.get_json()
    
    availability = json_data["availability"]
    minimum_nights = json_data["minimum_nights"]
    latitude = json_data["latitude"]
    longitude = json_data["longitude"]
    name = json_data["name"]
    neighbourhood_group = json_data["neighbourhood_group"]
    neighbourhood = json_data["neighbourhood"]
    room_type = json_data["room_type"]
    
    dept = [(name,neighbourhood_group,neighbourhood,room_type,latitude,longitude,0.0,minimum_nights,0.0,1.0,availability,0.0)]

    df = spark.createDataFrame(data=dept, schema = deptColumns)
    
    df = stringIndexer_model.transform(df)
    
    df = df.drop(*["neighbourhood_group", 'neighbourhood', 'room_type'])
    df = ohe_model.transform(df)
    df = df.drop(*["neighbourhood_group_int", 'neighbourhood_int', 'room_type_int'])

    df = df.withColumn("minimum_nights", when(df["minimum_nights_int"] > 30, 30).otherwise(df["minimum_nights_int"])).drop('minimum_nights_int')
    df = df.withColumn('name_length', length('name')).drop('name')

    df = vectorAssembler.transform(df)
    df = df.select(['features'])
    df = feature_model.transform(df)
    df = df.select(['features_vec'])

    lr_predictions = lr_model.transform(df)
    
    return jsonify(data=lr_predictions.collect()[-1].prediction)
        
Example #14
def loadModelLinearRegression(conf, path):
    """
       input  : conf, path
       output : model (CrossValidatorModel / TrainValidationSplitModel / LinearRegressionModel)
    """

    # If ML tuning is used
    if conf["tuning"]:
        # If Cross Validation is used, the model type is CrossValidatorModel
        if conf["tuning"].get("method").lower() == "crossval":
            load_model = CrossValidatorModel.load(path)
        # If Train Validation is used, the model type is TrainValidationSplitModel
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            load_model = TrainValidationSplitModel.load(path)

    # If ML tuning is not used, the model type is LinearRegressionModel
    elif conf["tuning"] is None:
        load_model = LinearRegressionModel.load(path)

    return load_model
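
A minimal sketch of the kind of conf dictionaries this loader reads; only the keys actually used above are shown, and the exact shape of the real configuration is an assumption:

# Hypothetical configurations matching the three branches above
conf_cv = {"tuning": {"method": "crossval"}}        # loads a CrossValidatorModel
conf_tvs = {"tuning": {"method": "trainvalsplit"}}  # loads a TrainValidationSplitModel
conf_plain = {"tuning": None}                       # loads a LinearRegressionModel

model = loadModelLinearRegression(conf_plain, "/path/to/saved/model")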
Example #15
    def loadModel(self):

        if self.algoName == "linear_reg" or self.algoName == \
                "ridge_reg" or self.algoName == "lasso_reg" :
            regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "RandomForestAlgo" :
            regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "GradientBoostAlgo":
            regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

        # drop any prediction column already added by a previous run of the same model
        self.dataset = self.dataset.drop(self.modelSheetName)

        predictionData = regressionPrediction.transform(self.dataset)
        predictionData = predictionData.drop(self.featuresColm)

        #dropping extra added column
        if self.indexedFeatures:
            self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
            predictionData = predictionData.drop(*self.indexedFeatures)

        # overwriting the original dataset

        '''This write step is needed because Spark neither reads nor writes the whole dataset at once
        (it only pulls limited data into memory) and evaluates lazily, so overwriting the same dataset
        that is still being read is not possible; the predictions are therefore written to a temporary
        file first and read back before the final write.'''
        emptyUserId = ''
        fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
        predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
        predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                                       locationAddress=self.locationAddress,
                                                       userId=emptyUserId,
                                                       data=predictionDataReadAgain)        
        return predictionTableData
Example #16
    def __init__(self):
        print('== [Model] Creating spark session...')
        self.spark = SparkSession.builder.appName('lin_reg_api').getOrCreate()
        #self.spark = SparkSession.newSession()
        print('== [Model] spark version', self.spark.version)
        print('== [Model] Loading model...')
        self.model = LinearRegressionModel.load('model_lin_reg')
        print('== [Model] Loading complete...')

        self.entire_Set = self.spark.read.csv('./airfoil_self_noise.csv',
                                              header=True,
                                              inferSchema=True)

        # define transformers...
        self.airfoil_assembler = VectorAssembler(inputCols=X_Cols,
                                                 outputCol='_features')
        freq_scaler = StandardScaler(inputCol="_features",
                                     outputCol="features")

        tuned_input_vec = self.airfoil_assembler.transform(
            self.entire_Set).select('_features')
        self.std_scaler = freq_scaler.fit(tuned_input_vec)
        return
Example #17
    conf = SparkConf().setAppName(appName).setMaster("spark://ubuntu:7077")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    #load data
    data = None
    if dataType == "libsvm":
        data = sqlContext.read.format("libsvm").load(dataPath)

    #load model
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
Example #18
def loadModel(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # dataset = spark.read.csv('/home/fidel/mltest/testData.csv', header=True, inferSchema=True)
        # testDataFetched =  testDataFetched.select('Independent_features', 'MPG')
        # testDataFetched.show()
        # testDataFetched.printSchema()

        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()

        # renaming the column
        # print(label_colm)
        # dataset.withColumnRenamed(label_colm, "label")
        # print(label_colm)
        # dataset.show()

        label = ''
        for y in label_colm:
            label = y

        print(label)

        dictionary_list = {
            'log_list': ["CYLINDERS"],
            'sqrt_list': ["WEIGHT"],
            'cubic_list': ["ACCELERATION"]
        }

        relationship_val = 'linear_reg'

        if relationship_val == 'linear_reg':
            print('linear relationship')
        else:
            dataset = Relationship(dataset, dictionary_list)

        dataset.show()

        # implementing the vector assembler

        featureassembler = VectorAssembler(inputCols=feature_colm,
                                           outputCol="Independent_features")

        output = featureassembler.transform(dataset)

        output.show()
        output = output.select("Independent_features")

        # finalized_data = output.select("Independent_features", label)

        # finalized_data.show()

        regressorTest = LinearRegressionModel.load(
            '/home/fidel/mltest/linearRegressorFitModel')
        predictedData = regressorTest.transform(output)

        predictedData.show()

    except Exception as e:
        print('exception ' + str(e))


#
# if __name__== '__main__':
#     loadModel()
Example #19
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_LinearRegression'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("LinearRegression_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing only
        #dataset = dataset[0:1000]

        Y_datavec = dataset[Y_names].values
        # Get the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle imbalanced data
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Store the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        fomula = RFormula(formula='Y ~ .',
                          featuresCol="features",
                          labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Train the model
        clf_model = dmp.Distr_LinearRegression(xy_train, xy_test)
        # Save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Get the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = LinearRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
dataset.groupby("quality").count().show()

# ################################################################################################################
# export the trained model and create a zip file for ease of download
import shutil
from pyspark.ml.regression import LinearRegressionModel
regressor.write().overwrite().save("cs643")

path_drv = shutil.make_archive("cs643", format='zip', base_dir="cs643")
shutil.unpack_archive(
    "cs643.zip",
    "test",
    format='zip',
)

loadedRegressor = LinearRegressionModel.load("test/cs643")
predictions = loadedRegressor.transform(valid_finalized_data)
print(loadedRegressor.numFeatures)
predictions.show()

# ################################################################################################################
# run some quick evaluations
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(labelCol=dataset.columns[11],
                           predictionCol="prediction",
                           metricName="rmse")
# Root Mean Square Error
rmse = eval.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)
# Mean Square Error
mse = eval.evaluate(pred.predictions, {eval.metricName: "mse"})
Example #21
#need to load in testing dataset

import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
sc= SparkContext()
sqlContext = SQLContext(sc)

print(sys.argv[1])
test_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true'
	, delimiter=';').load(sys.argv[1])
print(test_df.take(1))

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['"""""fixed acidity""""', '""""volatile acidity""""'
	, '""""citric acid""""', '""""residual sugar""""', '""""chlorides""""', '""""free sulfur dioxide""""'
	, '""""total sulfur dioxide""""', '""""density""""', '""""pH""""', '""""sulphates""""', '""""alcohol""""']
	, outputCol = 'features')
vtest_df = vectorAssembler.transform(test_df)
vtest_df = vtest_df.select(['features', '""""quality"""""'])
vtest_df.show(3)

from pyspark.ml.regression import LinearRegressionModel
lr_model = LinearRegressionModel.load('model')
lr_predictions = lr_model.transform(vtest_df)
lr_predictions.select('prediction','""""quality"""""','features').show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='""""quality"""""',metricName='r2')
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))
Example #22
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegressionModel

sc = SparkContext()
sqlContext = SQLContext(sc)
model_1 = LinearRegressionModel.load("My_Model")
print("Model loaded successfully")
Example #23
    .filter(df.dayofyear.isNotNull())

    # create a features column : list of open prices averaged by day
    df = df.groupby('symbol').agg(collect_list('avg(open)').alias("features"))

    # add a yearly average column
    yearly_avg = udf(lambda x: sum(x) / len(x), DoubleType())
    df = df.withColumn("yearly_average", yearly_avg("features"))

    # convert to vectors for the linear regression model
    array_to_vector = udf(lambda x: Vectors.dense(x[0]), VectorUDT())
    df = df.withColumn("features", array_to_vector("features"))

    # load the model and apply it
    model_path = "s3://" + bucket_name + "/models/lr_model"
    loaded_model = LinearRegressionModel.load(model_path)
    results = loaded_model.evaluate(df)
    predictions = results.predictions
    predictions = predictions.withColumn(
        "performance", ((col("prediction") / col("yearly_average")) - 1) * 100)
    performances = predictions.select("performance").rdd.map(
        lambda x: x[0]).collect()
    min_value = min(performances)
    max_value = max(performances)
    normalize = udf(lambda x: (x - min_value) / (max_value - min_value),
                    FloatType())

    # the score is the predicted price compared to the yearly average (normalized)
    predictions = predictions.select("symbol", "prediction", "performance") \
                             .withColumn("price_score", normalize("performance")) \
                             .drop("performance")
Example #24
## Import Libraries
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

sc = SparkContext()
spark = SparkSession(sc)

## Load model
lrModel = LinearRegressionModel.load(
    'gs://spark-training-data/ml_models/sample_model.model')

## Read in the data from model_test_jc
df = spark.read.format('libsvm').load(
    'gs://spark-training-data/datasets/sample_linear_regression_data.txt')
df.show(5)

## Predict Results
predictions = lrModel.transform(df)
predictions.show(5)
Example #25
 def test_lr_evaluate_invaild_type(self):
     lr = LinearRegressionModel()
     invalid_type = ""
     self.assertRaises(TypeError, lr.evaluate, invalid_type)
Example #26
	valid_data_final.show()
	

	# Split training data into 80% and 20%
	train_data,test_data = data_final.randomSplit([0.8,0.2])
	regressor = LinearRegression(featuresCol = 'Attributes', labelCol = dataset.columns[11] )

	# Train using training data 
	regressor = regressor.fit(train_data)

	pred = regressor.evaluate(test_data)

	# Predict the model
	pred.predictions.show()

	predictions = regressor.transform(valid_data_final)
	predictions.show()

	# Save the model so that we can export it for later use
	regressor.write().overwrite().save("trained-model")

	path_drv = shutil.make_archive("trained-model", format='zip', base_dir="trained-model")
	shutil.unpack_archive("trained-model.zip", "trained-model-sample",format='zip',)

	loadedRegressor = LinearRegressionModel.load("trained-model-sample/trained-model")
	predictions = loadedRegressor.transform(valid_data_final)
	print(loadedRegressor.numFeatures)
	predictions.show()

	spark.stop()
Example #27
import pika
import sys
import json
import pyspark
import time
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import multiprocessing
import threading

database_features_ordered = ['VendorID','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','RatecodeID','store_and_fwd_flag','PULocationID','DOLocationID','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount']
sc = pyspark.sql.SparkSession.builder.appName("nycApp").getOrCreate()
sc.sparkContext._conf.set('spark.executor.cores', multiprocessing.cpu_count())
print(sc.sparkContext._conf.getAll())
model_1 = LinearRegressionModel.load("/home/gcpkey/lr.model")
topic = "streaming_data"
credentials = pika.PlainCredentials('user', 'QwwyqaQj1C4i')
parameters = pika.ConnectionParameters('35.247.117.124',5672,'/',credentials)
connection = pika.BlockingConnection(parameters)
connection1 = pika.BlockingConnection(parameters)
channel = connection.channel()
channel1 = connection1.channel()
channel1.queue_declare(queue="receivePredictedFareClient1")
channel.queue_declare(queue=topic)
def callback(ch, method, properties, body):
    df_message = pd.DataFrame.from_dict([json.loads(body.decode())])
    df_message = df_message[database_features_ordered]
    df_message_pyspark = sc.createDataFrame(df_message)
    df_message_pyspark.write.csv("hdfs://cluster-9bfd-m/hadoop/data1.csv", header=True, mode='append')
Example #28
def load_json_and_predict(spark, sqlContext, json_file):

    # Load data to predict
    #predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply same process as historical data to convert/map

    # Drop rows with NA columns
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()

    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) & (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec,
        IntegerType())  # LongType() not available for now

    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners",
        predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners",
        predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders",
        predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(
            IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items",
        predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use same features as in historical data
    # Other columns in test data ('store_id', 'store_primary_category', 'min_item_price', 'max_item_price')
    # will be dropped by VectorAssembler transformation

    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)

    if (model_name == DT_MODEL):
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)

    if (model_name == GBT_MODEL):
        predict_model = GBTRegressionModel.load(MODEL_DIR)

    if (model_name == LR_MODEL):
        predict_model = LinearRegressionModel.load(MODEL_DIR)

    if (model_name == RF_MODEL):
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)

    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction',
        'predicted_delivery_seconds').withColumn("id",
                                                 monotonically_increasing_id())

    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))

    return prediction_results_df
Example #29
import json
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import hour, minute, second, col, avg, when
import pyspark.sql.functions as sql_functions
'''import kafka library for consumer'''
from kafka import KafkaConsumer
'''import kafka library for producer'''
from kafka import KafkaProducer
'''import pyspark mlib library'''
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler

sc = SparkContext()
sqlContext = SQLContext(sc)
try:
    Model_Path = "stockModel"
    load_model = LinearRegressionModel.load(Model_Path)
except:
    print("Model not Found")

consumer = KafkaConsumer('stock_price')


def stock_price_prediction():
    try:

        for msg in consumer:
            res_dict = json.loads(msg.value.decode('utf-8'))
            data_list = list(res_dict.values())
            dataframe = pd.DataFrame(
                [data_list],
                columns=['Open', 'Close', 'Volume', 'High', 'Low'])
Example #30
def predict(sql, sc, columns, station_id, currentWeather):
    columnsToPredict = [
        "max_temp", "med_temp", "min_temp", "max_pressure", "min_pressure",
        "precip", "insolation"
    ]
    returnedPredictions = []

    # schema = StructType([])

    field = [StructField("station_id", StringType(), True),
             StructField("max_temp", FloatType(), True), \
             StructField("max_temp", FloatType(), True), \
             StructField("med_temp", FloatType(), True), \
             StructField("min_temp", FloatType(), True), \
             StructField("max_pressure", FloatType(), True), \
             StructField("min_pressure", FloatType(), True), \
             StructField("precip", FloatType(), True), \
             StructField("insolation", FloatType(), True), \
             StructField("prediction_max_temp", FloatType(), True), \
             StructField("prediction_max_temp", FloatType(), True), \
             StructField("prediction_med_temp", FloatType(), True), \
             StructField("prediction_min_temp", FloatType(), True), \
             StructField("prediction_max_pressure", FloatType(), True), \
             StructField("prediction_min_pressure", FloatType(), True), \
             StructField("prediction_precip", FloatType(), True), \
             StructField("prediction_insolation", FloatType(), True)]

    schema = StructType(field)

    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)

    fields1 = [StructField("station_id", StringType(), True),
               StructField("max_temp", FloatType(), True), \
               StructField("med_temp", FloatType(), True), \
               StructField("min_temp", FloatType(), True), \
               StructField("max_pressure", FloatType(), True), \
               StructField("min_pressure", FloatType(), True), \
               StructField("precip", FloatType(), True), \
               StructField("insolation", FloatType(), True)]

    schema1 = StructType(fields1)

    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)
    firstTime = True

    for column in columns:
        modelPath = "models/" + station_id + "__" + column
        if not os.path.exists(modelPath):
            logger.info("####No Model")
            break

        lrModel = LinearRegressionModel.load(modelPath)

        assembler = VectorAssembler(inputCols=[column], outputCol="features")

        df_for_predict = sql.createDataFrame(
            [(
                currentWeather["station_id"],
                float(currentWeather["max_temp"]
                      ),  # if column != "max_temp" else None,
                float(currentWeather["med_temp"]
                      ),  # if column != "med_temp" else None,
                float(currentWeather["min_temp"]
                      ),  # if column != "min_temp" else None,
                float(currentWeather["max_pres"]
                      ),  # if column != "max_pres" else None,
                float(currentWeather["min_pres"]
                      ),  # if column != "min_pres" else None,
                float(currentWeather["precip"]
                      ),  # if column != "precip" else None,
                float(currentWeather["insolation"]),
                # if column != "insolation" else None,
            )],
            schema1)

        assembledTestData = assembler.transform(df_for_predict)
        prediction_data = assembledTestData.withColumn(
            "label",
            df_for_predict[column]).withColumn("features",
                                               assembledTestData.features)
        prediction_data1 = clearColumn(prediction_data, "label")

        predictions = lrModel.transform(prediction_data1,
                                        params={
                                            lrModel.intercept: True
                                        }).select("station_id", column,
                                                  "prediction")
        predictions.show()

        predictions1 = predictions.withColumn(str("prediction_" + column),
                                              predictions.prediction)

        returnedPredictions.append(
            generalFunctions.dataframeToJson(predictions1))

    return json.dumps(returnedPredictions)
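
A hypothetical training-side sketch matching the per-column model layout used above (one model saved under "models/<station_id>__<column>"); the history DataFrame and the choice of label column are assumptions:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

def train_models(history_df, station_id, columns, label_col):
    # history_df: per-station historical weather; label_col: the value to predict (assumption)
    for column in columns:
        assembler = VectorAssembler(inputCols=[column], outputCol="features")
        train_data = assembler.transform(history_df).withColumn("label", history_df[label_col])
        model = LinearRegression().fit(train_data)
        model.write().overwrite().save("models/" + station_id + "__" + column)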