def __prediction(self, df_pred):
        logger.info(
            "__prediction: LOADING PIPELINE ##################################### "
        )
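        # Note: this loads an unfitted Pipeline, so it is re-fit on the prediction
        # data before transforming; a saved PipelineModel would not need the fit step.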
        gbt_pipeline = Pipeline.load('budget_prediction_pipeline/')
        gbt_pipeline_loaded = gbt_pipeline.fit(df_pred)
        ddf_features_df = gbt_pipeline_loaded.transform(df_pred)

        logger.info(
            "__prediction: FILTERING DATA ##################################### "
        )
        ddf_features_df = ddf_features_df.filter("idmovie in(99999)")

        logger.info(
            "__prediction: LOADING MODEL ##################################### "
        )
        gbt_model_load = GBTRegressionModel.load('gbt_model_old/')
        gbt_model_pred = gbt_model_load.transform(ddf_features_df)
        gbt_model_pred.selectExpr(
            'idmovie', 'director', 'genres', 'runtime',
            'cast(prediction as Decimal(38,2)) as prediction').show(
                truncate=False)

        logger.info(
            "__prediction: DATA PREDICTED ##################################### "
        )
        return gbt_model_pred.selectExpr('director', 'genres', 'runtime',
                                         'prediction')
Example No. 2
def loadModels(path, typeofmodel):
  # park_data_with_date_dict is assumed to be a module-level dict keyed by park
  models = {}
  for park in park_data_with_date_dict:
    if typeofmodel == "linear":
      models[park] = LinearRegressionModel.load(path+str(park))
    elif typeofmodel == "tree":
      models[park] = DecisionTreeRegressionModel.load(path+str(park))
    elif typeofmodel == "gbt":
      models[park] = GBTRegressionModel.load(path+str(park))
  return models
Example No. 3
def predict():
    spark = SparkSession.builder.appName('airbnb_price').getOrCreate()
    if request.method != 'POST':
        return jsonify({'error': 'expected a POST request with a JSON body'}), 405
    # spark.read.json expects a path or an RDD of JSON strings, not a parsed object,
    # so re-serialize the request body (uses the standard-library json module)
    raw_data = spark.read.json(spark.sparkContext.parallelize([json.dumps(request.json)]))
    model = GBTRegressionModel.load(MODEL_PATH)
    data = data_processing(raw_data)
    gbt_predictions = model.transform(data)
    output = gbt_predictions.select('prediction')
    # toJSON() returns an RDD of JSON strings; collect it so jsonify can serialize it
    json_output = output.toJSON().collect()
    return jsonify({'prediction': json_output})
Example No. 4
    def predict(self):
        """Assemble the feature columns, load the saved GBT model, and return predictions.

        :return: a DataFrame of predictions
        """
        cols = [x for x in self.data.columns if x not in ['datetime', 'label']]
        assembler = VectorAssembler(handleInvalid="keep") \
            .setInputCols(cols).setOutputCol("features")

        print('assembler')
        test = assembler.transform(self.data)
        test = test.drop(*cols)

        gbt = GBTRegressionModel.load('myGBTRegressor_nan')
        preds = gbt.transform(test)
        # printSchema() prints directly and returns None, so don't wrap it in print()
        preds.printSchema()

        preds.write.save("regression_preds_5.parquet")

        return preds
Example No. 5
def GBT_regressor():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    print(gbt.getImpurity())
    # variance
    model = gbt.fit(df)
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    model.numFeatures
    # 1
    # allclose comes from numpy (from numpy import allclose)
    allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
    # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 1.0
    temp_path = "./"
    gbtr_path = temp_path + "gbtr"
    gbt.save(gbtr_path)
    gbt2 = GBTRegressor.load(gbtr_path)
    gbt2.getMaxDepth()
    # 2
    model_path = temp_path + "gbtr_model"
    model.save(model_path)
    model2 = GBTRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances
    # True
    model.treeWeights == model2.treeWeights
    # True
    model.trees
Example No. 6
    def loadModel(self):

        if self.algoName == "linear_reg" or self.algoName == \
                "ridge_reg" or self.algoName == "lasso_reg" :
            regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "RandomForestAlgo" :
            regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "GradientBoostAlgo":
            regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

        #dropping the already existed column of prediction on same model
        self.dataset = self.dataset.drop(self.modelSheetName)

        predictionData = regressionPrediction.transform(self.dataset)
        predictionData = predictionData.drop(self.featuresColm)

        #dropping extra added column
        if self.indexedFeatures:
            self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
            predictionData = predictionData.drop(*self.indexedFeatures)

        # Overwriting the original dataset

        '''Writing to a temporary file first is necessary: Spark evaluates lazily and
        processes data in partitions rather than all at once, so a dataset cannot be
        overwritten in place while it is still being read within the same job.'''
        emptyUserId = ''
        fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
        predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
        predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                               locationAddress=self.locationAddress,
                                               userId=emptyUserId,
                                               data=predictionDataReadAgain)
        return predictionTableData
Example No. 7
def predict(features_tab, tab_out, model_path, veh):
    # 1 Configuration
    spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("tianzw_vol_fading_predict_second_versions") \
        .config("spark.sql.warehouse.dir","hdfs://neicluster/user/hive/warehouse") \
        .enableHiveSupport() \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    # 2 Prepare the data
    sql = """
        SELECT  vin             ,
                sta_time        ,
                mils_1000km     ,
                sta_soc         ,
                charge_c        ,
                hours           ,
                temp            ,
                days            ,
                mils_dif        ,
                cnt_cha         ,
                vol_cha         ,
                vol_avg_cha     ,
                hou_cha         ,
                c_avg           ,
                sta_soc_avg_cha ,
                end_soc_avg_cha ,
                dep_soc_avg_cha ,
                sta_soc_mid_cha ,
                end_soc_mid_cha ,
                dep_soc_mid_cha ,
                cnt_tem         ,
                tem_mid_yea     ,
                tem_avg_yea     ,
                tem_dif_yea     ,
                tem_var_yea     
        FROM    """ + features_tab + """
        WHERE   veh_head = SUBSTR('""" + veh + """',0,1)
        AND     veh = '""" + veh + """'
            """
    rdd_origin = spark.sql(sql).rdd
    features_rdd = rdd_origin.map(lambda x: (
        x.vin,
        x.sta_time,
        Vectors.dense([
            x.mils_1000km, x.sta_soc, x.charge_c, x.hours, x.temp, x.days,
            x.mils_dif, x.cnt_cha, x.vol_cha, x.vol_avg_cha, x.hou_cha,
            x.c_avg, x.sta_soc_avg_cha, x.end_soc_avg_cha, x.dep_soc_avg_cha,
            x.sta_soc_mid_cha, x.end_soc_mid_cha, x.dep_soc_mid_cha,
            x.cnt_tem, x.tem_mid_yea, x.tem_avg_yea, x.tem_dif_yea,
            x.tem_var_yea
        ]),
    ))
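    # collect() brings every feature row back to the driver; this assumes the
    # per-vehicle result set fits in driver memory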
    features_list = features_rdd.collect()
    print("数据提取成功")
    spark_df = spark.createDataFrame(features_list,
                                     ["vin", "sta_time", "features"])
    # 3 模型预测
    # model = GBTRegressor.load(model_path)
    model = GBTRegressionModel.load(model_path)
    print("模型导入成功")
    predictions = model.transform(spark_df)
    print("计算成功")
    new_list = [(x.vin, x.sta_time, x.prediction)
                for x in predictions.collect()]
    result_df = spark.createDataFrame(new_list,
                                      ["vin", "sta_time", "vol_fading"])
    result_df = result_df.repartition(1)
    result_df.createOrReplaceTempView("table_temp")
    # Write the results into the Hive table
    # createSQL = """
    #             CREATE EXTERNAL TABLE IF NOT EXISTS """ + tab_out + """
    #             (
    #                 vin         STRING      COMMENT 'vehicle identification number (VIN)',
    #                 sta_time    BIGINT      COMMENT 'charge start time (s)',
    #                 vol_fading  DOUBLE      COMMENT 'predicted capacity fading percentage (2 decimals)'
    #             )
    #             PARTITIONED BY(veh      STRING      COMMENT 'vehicle model name')
    #             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    # """
    insertSql = """
                INSERT OVERWRITE TABLE """ + tab_out + """
                PARTITION(veh = '""" + veh + """')
                SELECT      vin,
                            sta_time,
                            ROUND(vol_fading,2)     AS vol_fading
                FROM        table_temp
    """
    # spark.sql("DROP TABLE IF EXISTS " + tab_out)
    # spark.sql(createSQL)
    spark.sql(insertSql)
    print(tab_out + "写入成功")
Example No. 8
    def __load_from_hdfs(self):
        mdl = GBTRegressionModel.load(self.hdfs_uri)
        print("GBT() - model loaded from uri {}".format(self.hdfs_uri))
        return mdl
Example No. 9
    StructField("EDUCATION", StringType(), True),
    StructField("OCCUPATION", StringType(), True),
    StructField("TRAVTIME", StringType(), True),
    StructField("BLUEBOOK", StringType(), True),
    StructField("TIF", StringType(), True),
    StructField("CAR_TYPE", StringType(), True),
    StructField("OLDCLAIM", StringType(), True),
    StructField("CLM_FREQ", StringType(), True),
    StructField("MVR_PTS", StringType(), True),
    StructField("CAR_AGE", StringType(), True)
])

print("Loading Models....")
PModel = PipelineModel.load(PipelineLoc)
LRModel = LogisticRegressionModel.load(CatModelLoc)
GBTModel = GBTRegressionModel.load(IntModelLoc)

if KafkaBserver == "<ENTER-KAFKAZBROKER-HERE>":
    print("Update Kafka server in the file")
    exit()

raw_records = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", KafkaBserver)\
    .option("subscribe", "NewUser") \
    .option("startingOffsets", "latest") \
    .load()

NestedJsonDf = raw_records.select(
    f.col("key").cast("string"),
    f.from_json(f.col("value").cast("string"), schema).alias("UserRecord"))
FlatDf = NestedJsonDf.selectExpr(
Example No. 10
    print(output.count())
    test_features = output.na.drop()
    print(test_features.count())

    # test_output = assembler.transform(test)
    # print(test_output.count())
    # train_output = test_output.na.drop()
    # print(test_output.count())
    # print("Assembled columns 'hour', 'minute' .. to vector column 'features'")
    # test_output.show(truncate=False)#.select("features", "clicked")

    from pyspark.ml.regression import GBTRegressor, GBTRegressionModel
    # gbt = GBTRegressor(featuresCol="features", maxIter=10)
    path = "bike_sharing_gbt_file.model"

    gbt_model = GBTRegressionModel.load(path)
    # Make predictions.

    print("Before model creation")
    predictions = gbt_model.transform(test_features)
    print("After model creation")
    predictions.printSchema()
    predictions.show()
    # gbt_model.write().overwrite().save(path)
    # Select example rows to display.
    from pyspark.sql.functions import col, lit, concat
    bs_df.show()
    # predictions = predictions.withColumn("datetime", bs_df.select("datetime"))
    predictions = predictions.withColumn(
        "datetime",
        concat(col("year"), lit("-"), col("month"), lit("-"), col("day"),
Example No. 11
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_GBTRegressor'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark session
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("GBTRegressor_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing only
        #dataset = dataset[0:1000]

        Y_datavec = dataset[Y_names].values
        # Get the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle data imbalance
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Store the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate features and targets
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and label
        fomula = RFormula(formula='Y ~ .',
                          featuresCol="features",
                          labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Train the model
        clf_model = dmp.Distr_GBTRegressor(xy_train, xy_test)
        # Save the trained model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Get the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
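        # Load the GBT model that the 'train' branch saved to dir_of_storeModel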
        clf_model = GBTRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
Example No. 12
def load_model(path, classifier='gbt'):
    if classifier == 'gbt':
        return GBTRegressionModel.load(path)  # for gbt
    return Pipeline.load(path)  # for random forest
Example No. 13
NoScale_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/NoScale_Pca.model')
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')

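# The same variable names are reused below to hold the PCA-projected DataFrames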
NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])

#Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')

#Generate prediction
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
#Prepare output df to output predictions
output_df = pd.DataFrame()
output_df['Algorithm'] = [
Example No. 14
    def prediction(self, predictiveData):

        '''Create a duplicate dataset so the datatypes of the original dataset are not changed.'''
        datasetAdd = predictiveData.get(PredictiveConstants.DATASETADD)
        spark = predictiveData.get(PredictiveConstants.SPARK)
        dataset = spark.read.parquet(datasetAdd)

        # adding extra index column in the dataset
        dataset = PredictiveUtilities.addInternalId(dataset)
        predictiveData.update({
            PredictiveConstants.DATASET: dataset
        })

        etlStats = PredictiveUtilities.performETL(etlInfo=predictiveData)
        dataset = etlStats.get(PredictiveConstants.DATASET)
        originalDataset = etlStats.get(PredictiveConstants.ORIGINALDATASET)

        algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
        modelStorageLocation = predictiveData.get(PredictiveConstants.MODELSTORAGELOCATION)
        modelName = predictiveData.get(PredictiveConstants.MODELSHEETNAME)
        datasetName = predictiveData.get(PredictiveConstants.DATASETNAME)
        spark = predictiveData.get(PredictiveConstants.SPARK)
        locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)

        if PredictiveConstants.LINEAR_REG.__eq__(algoName) or \
                PredictiveConstants.RIDGE_REG.__eq__(algoName) or PredictiveConstants.LASSO_REG.__eq__(algoName):
            regressionPrediction = LinearRegressionModel.load(modelStorageLocation)
        if PredictiveConstants.RANDOMFORESTALGO.__eq__(algoName):
            regressionPrediction = RandomForestRegressionModel.load(modelStorageLocation)
        if PredictiveConstants.GRADIENTBOOSTALGO.__eq__(algoName):
            regressionPrediction = GBTRegressionModel.load(modelStorageLocation)

        dataset = dataset.drop(modelName)
        originalDataset = originalDataset.drop(modelName)
        dataset = regressionPrediction.transform(dataset)
        dataset = dataset.select(PredictiveConstants.DMXINDEX, modelName)
        finalDataset = originalDataset.join(dataset, on=[PredictiveConstants.DMXINDEX]) \
            .sort(PredictiveConstants.DMXINDEX).drop(PredictiveConstants.DMXINDEX)

        # predictionData = predictionData.drop(featuresColm)
        #
        # #dropping extra added column
        # if indexedFeatures:
        #     indexedFeatures.extend(oneHotEncodedFeaturesList)
        #     predictionData = predictionData.drop(*indexedFeatures)
        # else:
        #     predictionData = predictionData

        # Overwriting the original dataset
        '''Writing to a temporary file first is necessary: Spark evaluates lazily and
        processes data in partitions rather than all at once, so a dataset cannot be
        overwritten in place while it is still being read within the same job.'''
        emptyUserId = ''
        randomUUID = str(uuid.uuid4())
        fileNameWithPathTemp = locationAddress + randomUUID + datasetName + "_temp.parquet" #correct the name.
        finalDataset.write.parquet(fileNameWithPathTemp, mode="overwrite")  # send this path to java for deletion
        predictionDataReadAgain = spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=datasetName,
                                               locationAddress=locationAddress,
                                               userId=emptyUserId,
                                               data=predictionDataReadAgain)
        return predictionTableData
Example No. 15
def load_json_and_predict(spark, sqlContext, json_file):

    # Load data to predict
    #predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply same process as historical data to convert/map

    # Drop rows with NA columns
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()

    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) & (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec,
        IntegerType())  # LongType() not available for now

    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners",
        predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners",
        predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders",
        predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(
            IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items",
        predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use same features as in historical data
    # Other columns in test data ('store_id', 'store_primary_category', 'min_item_price', 'max_item_price')
    # will be dropped by VectorAssembler transformation

    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)

    if (model_name == DT_MODEL):
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)

    if (model_name == GBT_MODEL):
        predict_model = GBTRegressionModel.load(MODEL_DIR)

    if (model_name == LR_MODEL):
        predict_model = LinearRegressionModel.load(MODEL_DIR)

    if (model_name == RF_MODEL):
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)

    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction',
        'predicted_delivery_seconds').withColumn("id",
                                                 monotonically_increasing_id())
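    # Note: monotonically_increasing_id() only pairs rows correctly when both
    # DataFrames preserve the same row order and partitioning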

    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))

    return prediction_results_df
Example No. 16
    #load model
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
    elif algoName == "GBTRegression":
        from pyspark.ml.regression import GBTRegressionModel
        model = GBTRegressionModel.load(modelPath)

    #predict
    prediction = model.transform(data).select("prediction")

    #save
    prediction.write.format("csv").save(outputPath)
            "jar_files/commons-pool2-2.6.2.jar") \
        .getOrCreate()

    # Set number of output partitions
    spark.conf.set("spark.sql.shuffle.partitions", 5)

    # Set log level
    spark.sparkContext.setLogLevel("ERROR")

    target_cols = norm_params['target']
    models = {}

    for target in target_cols:

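        # Pick the saved model whose path contains this target column's name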
        model = [model for model in model_params if target in model]
        models[target] = GBTRegressionModel.load(model[0])

else:
    raise TypeError("Unrecognized model")

if not isinstance(model_params, list):

    if model_params.endswith('.pt') or 'sklearn' in model_params:

        for message in consumer:

            df = pd.read_json(message.value)

            timeAtServer = float(df.timeAtServer)
            aircraft = int(df.aircraft)