def save_load_model():
    """Interpret model results, save and load model"""
    # (1) Interpreting results
    """It is almost always important to know which features are influencing your prediction the most. Perhaps its counterintuitive and that's an insight? Perhaps a hand full of features account for most of the accuracy of your model and you don't need to perform time acquiring or massaging other features.

    In this example we will be looking at a model that has been trained without any LISTPRICE information. With that gone, what influences the price the most?"""
    import pandas as pd

    # Convert feature importances to a pandas column
    importances = model.featureImportances.toArray()
    fi_df = pd.DataFrame(importances, columns=['importance'])

    # Convert list of feature names to pandas column
    fi_df['feature'] = pd.Series(feature_cols)

    # Sort the data based on feature importance
    fi_df.sort_values(by=['importance'], ascending=False, inplace=True)

    # Inspect Results
    fi_df.head(10)
    #     importance             feature
    # 36    0.256598          SQFT_TOTAL
    # 4     0.212320               TAXES
    # 6     0.166661          LIVINGAREA
    # ...
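
    # A minimal visualization sketch (assuming matplotlib is available in the
    # environment) of the top-10 importances computed above:
    import matplotlib.pyplot as plt

    top10 = fi_df.head(10).iloc[::-1]  # reverse so the largest bar is drawn on top
    plt.barh(top10['feature'], top10['importance'])
    plt.xlabel('Feature importance')
    plt.tight_layout()
    plt.show()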

    # (2) Saving and loading models
    """Often times you may find yourself going back to a previous model to see what assumptions or settings were used when diagnosing where your prediction errors were coming from. Perhaps there was something wrong with the data? Maybe you need to incorporate a new feature to capture an unusual event that occurred?"""
    from pyspark.ml.regression import RandomForestRegressionModel

    # Save model
    model.save('rfr_no_listprice')

    # Load model
    loaded_model = RandomForestRegressionModel.load('rfr_no_listprice')
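
    # A quick sketch of inspecting which settings the reloaded model was
    # trained with (handy when revisiting an old run to check its assumptions):
    print(loaded_model.getNumTrees)        # number of trees in the forest
    print(loaded_model.extractParamMap())  # full parameter map of the model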
    def getOrCreateRFR(self):
        try:
            if self.rfrModel is None:
                self.rfrModel = RandomForestRegressionModel.load(CONST_RFR_FILE)
        except Exception:
            print("Creating RFR Model")
            self.rfrModel = self.createRFR()

        return self.rfrModel
Example #3
 def load(self, load_dir):
     if os.path.isdir(load_dir):
         if self.pm == 'PM10':
             self.model = LinearRegressionModel.load(
                 os.path.join(load_dir, 'model'))
         else:
             self.model = RandomForestRegressionModel.load(
                 os.path.join(load_dir, 'model'))
         self.imputer = ImputerModel.load(os.path.join(load_dir, 'imputer'))
         self.assembler = VectorAssembler.load(
             os.path.join(load_dir, 'assembler'))
     else:
         raise RuntimeError(
             'Save path {} does not exist or is not a directory'.format(
                 load_dir))
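
 # A possible save() counterpart (a sketch, assuming the directory layout the
 # load() above expects: 'model', 'imputer' and 'assembler' sub-paths):
 def save(self, save_dir):
     os.makedirs(save_dir, exist_ok=True)
     self.model.write().overwrite().save(os.path.join(save_dir, 'model'))
     self.imputer.write().overwrite().save(os.path.join(save_dir, 'imputer'))
     self.assembler.write().overwrite().save(os.path.join(save_dir, 'assembler'))
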
 def __getBasePredictors(self, num=5):
     res = []
     if not os.path.exists(self._predictorPath):
         os.mkdir(self._predictorPath)
     if os.listdir(self._predictorPath):
         for i in range(num):
             res.append(
                 RandomForestRegressionModel.load(
                     self._predictorModelPath.format(i)))
     else:
         for i in range(num):
             model = self.__getBasePredictor(i)
             res.append(model)
             model.write().overwrite().save(
                 self._predictorModelPath.format(i))
     return res
Example #5
def rfRegressor(df):
    df = df.withColumn('tmp_price', df['price'])
    df = df.drop('price')
    df = df.withColumnRenamed('tmp_price', 'price')

    feature_label = df.rdd.map(lambda x: (Vectors.dense(
        [float(i) for i in x[0:-1]]), float(x[-1]))).toDF(
            ["features", "label"])

    (trainingData, testData) = feature_label.randomSplit([0.7, 0.3])

    rf = RandomForestRegressor()

    model = rf.fit(trainingData)

    importance_map_df = importance_features_map(df, model, 'price')

    # Make predictions.
    predictions = model.transform(testData)
    predict_df = predictions.select("prediction", "label")
    predict_df = predict_df.withColumn(
        'rate',
        (predict_df['prediction'] - predict_df['label']) / predict_df['label'])

    def udf_rate(s):
        return round(abs(s), 3)

    udf_rate = udf(udf_rate)

    predict_df = predict_df.select(
        '*',
        udf_rate(predict_df['rate']).alias('rates')).drop('rate')

    predict_df.show()

    model.save("/root/myModelPath1")
    sameModel = RandomForestRegressionModel.load("/root/myModelPath1")

    same_predict_df = sameModel.transform(testData)
    print('=======================================')
    same_predict_df.show()

    return importance_map_df, model
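
A minimal alternative sketch for scoring the predictions with RegressionEvaluator instead of the hand-rolled 'rates' column above (assumes a predictions DataFrame shaped like the one built inside rfRegressor):

def evaluate_rmse(predictions):
    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction',
                                    metricName='rmse')
    return evaluator.evaluate(predictions)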
Example #6
    def loadModel(self):

        if self.algoName == "linear_reg" or self.algoName == \
                "ridge_reg" or self.algoName == "lasso_reg" :
            regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "RandomForestAlgo" :
            regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
        if self.algoName == "GradientBoostAlgo":
            regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

        # drop any existing prediction column from a previous run of this model
        self.dataset = self.dataset.drop(self.modelSheetName)

        predictionData = regressionPrediction.transform(self.dataset)
        predictionData = predictionData.drop(self.featuresColm)

        # drop the extra columns added during feature engineering
        if self.indexedFeatures:
            self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
            predictionData = predictionData.drop(*self.indexedFeatures)

        # overwriting the original dataset

        '''This write step is needed because Spark does not read or write the whole
        dataset at once: it only pulls limited data into memory and evaluates lazily,
        so overwriting a dataset that is still being read is not possible.'''
        emptyUserId = ''
        fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
        predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
        predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                                       locationAddress=self.locationAddress,
                                                       userId=emptyUserId,
                                                       data=predictionDataReadAgain)        
        return predictionTableData
Example #7
def random_forest_regressor_example():
    # Renamed: a function called RandomForestRegressor would shadow the
    # RandomForestRegressor estimator class it constructs below.
    from numpy import allclose
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])
    # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    model.numFeatures
    # 1
    model.trees
    # [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]
    model.getNumTrees
    # 2
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 0.5
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RandomForestRegressor.load(rfr_path)
    rf2.getNumTrees()
    # 2
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances
    # True
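
    # The reloaded model should reproduce the original predictions; a quick
    # sanity check reusing test1 from above:
    model2.transform(test1).head().prediction
    # 0.5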
Example #8
def predict_data(spark, logger, model_path, data):
    '''
        preprocess without pipeline
    '''
    from pyspark.ml.regression import RandomForestRegressionModel
    from project.schema import get_pred_schema
    import traceback
    try:
        assert len(data) > 0, 'empty data'
        logger.info("{} rows".format(len(data)))

        # create spark dataframe
        spark_data = spark.createDataFrame(data, get_pred_schema())

        # preprocessing
        preprocess_data = preprocess(spark_data)

        # load and predict
        m = RandomForestRegressionModel.load(model_path)
        pred = m.transform(preprocess_data)
        return [p['prediction'] for p in pred.collect()]

    except Exception:
        logger.error(traceback.format_exc())
        return None
Example #9
    #load model
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
    elif algoName == "GBTRegression":
        from pyspark.ml.regression import GBTRegressionModel
        model = GBTRegressionModel.load(modelPath)

    #predict
    prediction = model.transform(data).select("prediction")

    #save
    prediction.write.format("csv").save(outputPath)
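
The same dispatch can also be written as a lookup table; a sketch of an equivalent, more compact alternative to the if/elif chain above (module paths are the standard pyspark.ml ones):

from pyspark.ml import classification, regression

MODEL_CLASSES = {
    "LogisticRegression": classification.LogisticRegressionModel,
    "LinearRegression": regression.LinearRegressionModel,
    "DecisionTreeClassification": classification.DecisionTreeClassificationModel,
    "DecisionTreeRegression": regression.DecisionTreeRegressionModel,
    "RandomForestClassification": classification.RandomForestClassificationModel,
    "RandomForestRegression": regression.RandomForestRegressionModel,
    "GBTClassification": classification.GBTClassificationModel,
    "GBTRegression": regression.GBTRegressionModel,
}
# model = MODEL_CLASSES[algoName].load(modelPath)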
Example #11
def load_json_and_predict(spark, sqlContext, json_file):

    # Load data to predict
    #predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply same process as historical data to convert/map

    # Drop rows with NA columns
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()

    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) & (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec,
        IntegerType())  # LongType() not available for now

    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners",
        predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners",
        predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders",
        predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(
            IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items",
        predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use same features as in historical data
    # Other columns in test data ('store_id', 'store_primary_category', 'min_item_price', 'max_item_price')
    # will be dropped by VectorAssembler transformation

    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)

    if (model_name == DT_MODEL):
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)

    if (model_name == GBT_MODEL):
        predict_model = GBTRegressionModel.load(MODEL_DIR)

    if (model_name == LR_MODEL):
        predict_model = LinearRegressionModel.load(MODEL_DIR)

    if (model_name == RF_MODEL):
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)

    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction',
        'predicted_delivery_seconds').withColumn("id",
                                                 monotonically_increasing_id())

    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))

    return prediction_results_df
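
A hypothetical driver call for the function above (the JSON file name is an assumption):

# prediction_results_df = load_json_and_predict(spark, sqlContext, 'data_to_predict.json')
# prediction_results_df.show(10)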
Example #12
df_all_dates = df_all_dates.withColumn("PDT", col("DateTime").cast(LongType()) - col("PDT").cast(LongType())) \
    .withColumn("SPDT", col("DateTime").cast(LongType()) - col("SPDT").cast(LongType())) \
    .withColumn("NDT", col("NDT").cast(LongType()) - col("DateTime").cast(LongType())) \
    .withColumn("SNDT", col("SNDT").cast(LongType()) - col("DateTime").cast(LongType())) \
    .withColumn("TPDT", col("DateTime").cast(LongType()) - col("TPDT").cast(LongType())) \
    .withColumn("TNDT", col("TNDT").cast(LongType()) - col("DateTime").cast(LongType()))

res_file = "/regression/all_dates.csv"
df_all_dates.repartition(1).write.mode("overwrite").option("header", "true").option("sep", "|").csv(
    HDFS_NAMENODE + res_file)
# ---------------------------------------------------------------------------------------------------------------------
res_file = "/regression/all_dates.csv"

df_all_dates = spark.read.option("sep", "|").option("header", "true").csv(
    HDFS_NAMENODE + res_file)
model = RandomForestRegressionModel.load(HDFS_NAMENODE + "/models/rf_model.model")
print("model loaded")
interpol_udf = udf(interpol, FloatType())
required_features = [
    'Hour',
    'PGV',
    'PDT',
    'NDT',
    'NGV',
    'SPDT',
    'SPGV',
    'SNDT',
    'SNGV',
    'TPDT',
    'TPGV',
    'TNDT',
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')

NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])

#Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')

#Generate prediction
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
#Prepare output df to output predictions
output_df = pd.DataFrame()
output_df['Algorithm'] = [
    'Linear Regression', 'Decision Tree', 'Gradient Boosted Tree',
    'Random Forest'
Example #14
def basicPredictionPipeline(data,
                            col_target="",
                            first_pred_day=False,
                            dt_execution=False,
                            jarra='quinto',
                            logger=False,
                            verbose=True,
                            checks=False):
    try:

        start_all = datetime.now()

        # Get parameters from config file
        number_of_models = 6
        parser = SafeConfigParser()
        parser.read(MODEL_CONFIG_FILE)
        local_save_path = parser.get('save_params', 'local_save_path')
        if not os.path.exists(local_save_path):
            os.makedirs(local_save_path)

        # Define name of the variable for predictions
        cols_cyclical, cols_ohe_in, cols_features, col_target, cols_id = defineFeatures(
            model_complex='first',
            use_clustered_data_sets=False,
            col_target=col_target,
            verbose=False,
            logger=False)

        cols_ohe_out = [s + '_catVec' for s in cols_ohe_in]
        if first_pred_day:
            split_value = first_pred_day
        else:
            split_value = datetime.today()
            first_pred_day = split_value.strftime('%Y-%m-%d')
            split_value = split_value.strftime('%Y-%m-%d')
        if not dt_execution:
            dt_execution = split_value
        s3_save_path = parser.get('save_params', 's3_save_path')
        s3_save_pipelines_path = s3_save_path + 'pipelines/' + col_target + '/dt-execution=' + dt_execution + '/'

        # Connect to spark session
        spark = createSparkSession(jarra='mass', verbose=True, logger=logger)

        # Load data prep and model pipelines from S3 for model training run on dt_execution:
        if verbose:
            logger.info(
                'Loading data preparation and model pipelines lists from ' +
                s3_save_pipelines_path)

        pipelinePrepList = []
        fitList = []
        for i in range(number_of_models):
            pipelinePrepList.append(
                PipelineModel.read().load(s3_save_pipelines_path +
                                          "data_prep_pipeline" + str(i)))
            fitList.append(RandomForestRegressionModel.read().load(
                s3_save_pipelines_path + "model_pipeline" + str(i)))
        if verbose:
            logger.info(
                'Loading data preparation and model pipelines lists end')

        # Add cyclical variables to features lists, OHE_out not as they are already in pipelines
        cols_cyclical_sin = [s + '_sin' for s in cols_cyclical]
        cols_cyclical_cos = [s + '_cos' for s in cols_cyclical]
        cols_cyclical_out = cols_cyclical_sin + cols_cyclical_cos

        for i in range(len(cols_features)):
            cols_features[i] = cols_features[i] + cols_cyclical_out

        # Create list with start and end dates for each of consecutive models
        start_days_list, end_days_list = createTestDatesListWFV(
            split_value, verbose=verbose, logger=logger)

        # Define date filters for test/pred sets of each consecutive models
        filterPredStartList = []
        filterPredEndList = []
        for i in range(len(start_days_list)):
            filterPredStartList.append(
                col('dt_flight_date_local') >= start_days_list[i])
            filterPredEndList.append(
                col('dt_flight_date_local') <= end_days_list[i])

        # Create list with test data sets for each of the consecutive models, each data set have different features
        # and dates, also data list for rows/flights with Nulls (e.g. no historical data) is created separately
        test_data_list, test_data_basic_list = createTestDataLists(
            data, cols_features, cols_ohe_in, col_target, cols_id,
            filterPredStartList, filterPredEndList, spark, verbose, logger)

        # Transform string idexer, ohe, vector assembler using pipeline from training
        if verbose:
            logger.info(
                'String indexer, one hot encoder and vector assembler test sets, start'
            )

        testDataList = []
        testDataBasicList = []
        for i in range(len(test_data_list)):
            if verbose:
                logger.info('Model ' + str(i))
            testDataList.append(pipelinePrepList[i].transform(
                test_data_list[i]))
        if verbose:
            logger.info('RF Model start')

        # Apply RF model data using pipeline from training
        resultsList = []
        resultsBasicList = []
        for i in range(len(testDataList)):
            # Use the test set, is creating an extra column 'col_target' with the test fit results
            resultsList.append(fitList[i].transform(
                testDataList[i]).select(cols_id + [col_target + '_pred']))
        if verbose:
            logger.info('RF Model end')

        # Union dataframes with results for each model as one dataframe (to get the full results)
        resultsFull = resultsList[0]
        resultsFull = resultsFull.union(resultsBasicList[0])
        for i in range(1, len(test_data_list)):
            resultsFull = resultsFull.union(resultsList[i])
            resultsFull = resultsFull.union(resultsBasicList[i])
        resultsFull.cache()
        resultsFull = resultsFull.withColumn('dt_flight_date_local',
                                             to_date('dt_flight_date_local'))

        # Add execution date column
        resultsFull = resultsFull.withColumn('dt_execution',
                                             lit(first_pred_day))
        resultsFull = resultsFull.withColumn('dt_execution',
                                             to_date('dt_execution'))

        # Save prediction results in local for each model seperately
        if verbose:
            logger.info('Changing data frame to Pandas to save in local')
        model_results = resultsFull.toPandas()
        if not os.path.isdir(local_save_path):
            os.mkdir(local_save_path)
        model_results\
        .to_csv(local_save_path + col_target + '_results_' + first_pred_day.replace('-', '_') + '.csv', index=False)
        if verbose:
            logger.info('Results saved in: ' + local_save_path + col_target +
                        '_results_' + first_pred_day.replace('-', '_') +
                        '.csv')

        # Get feature importances
        featureImportancesFirst, featureImportancesLast, feature_importances_all = calcFeatImportance(
            fitList, testDataList, col_target, first_pred_day, verbose, logger)

        # Save feature importance for given target variable
        feature_importances_all.\
        to_csv(local_save_path + col_target + '_feat_importance_' + first_pred_day.replace('-', '_') + '.csv', index=False)
        end_all = datetime.now()
        if verbose:
            logger.info('Random Forest, all models, time: ' +
                        str(end_all - start_all))
            logger.info('Feature importance saved in: ' + local_save_path +
                        col_target + '_feat_importance_' +
                        first_pred_day.replace('-', '_') + '.csv')
            logger.info(
                'Check sum of predicted variables per month and count of flights each month: '
            )

        # Calculate metrics for mlflow
        if verbose and checks:
            df_prediction_errors, pred_errors = calcTrainingSetError(
                number_of_last_days_to_eval=90,
                last_dt_exec_to_evaluate=False,
                list_exec_dates_to_evalute=False,
                remove_outliers=True,
                verbose=True,
                logger=logger,
                checks=True)

            checkDuplicates = resultsFull.drop_duplicates(subset=['dt_flight_date_local', 'cd_num_flight', 'cd_airport_pair', 'cd_carrier'])\
                              .count() - resultsFull.count()
            resultsFullCount = resultsFull.count()

            # Count sum of rows in all test sets
            testSetCount = np.sum(
                [testDataList[i].count() for i in range(len(testDataList))])
            testBasicSetCount = np.sum([
                testDataBasicList[i].count()
                for i in range(len(testDataBasicList))
            ])

            logger.info('Sum of flights per month (real values): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    count("cd_airport_pair")).sort(
                        "dt_flight_year_month").toPandas())
            col_predict = col_target + '_pred'
            logger.info('Sum of predicted ' + col_predict +
                        ' per month (all flights): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    sum(col_predict)).sort("dt_flight_year_month").toPandas())
            logger.info('Number of duplicated flights: ')
            logger.info(checkDuplicates)

            logger.info('Number of rows/flights in test sets: ' +
                        str(testSetCount))
            logger.info('Number of rows/flights in basic model test sets: ' +
                        str(testBasicSetCount))
            logger.info('Number of flights/rows in prediction set:')
            logger.info(resultsFullCount)
            logger.info(
                'Feature importances for the first model (flights this week):')
            logger.info(featureImportancesFirst)
            logger.info('Feature importances for the last model:')
            logger.info(featureImportancesLast)
            mlflow_params = {
                'checkDuplicates':
                checkDuplicates,
                'resultsFullCount':
                resultsFullCount,
                'testSetCount':
                testSetCount,
                'testBasicSetCount':
                testBasicSetCount,
                'predDateMin':
                str(resultsFull.toPandas().dt_flight_date_local.min()),
                'predDateMax':
                str(resultsFull.toPandas().dt_flight_date_local.max()),
                'time_seconds': (end_all - start_all).total_seconds()
            }
        else:
            mlflow_params = {}
            pred_errors = None
        #spark.stop()
        #if verbose:
        #    logger.info('Spark Session stopped')
    except Exception:
        logger.exception("Fatal error in demand_forecast_pred()")
        raise
    return (mlflow_params, pred_errors)
    def prediction(self, predictiveData):

        '''Create a duplicate dataset to avoid changing the datatypes of the original dataset.'''
        datasetAdd = predictiveData.get(PredictiveConstants.DATASETADD)
        spark = predictiveData.get(PredictiveConstants.SPARK)
        dataset = spark.read.parquet(datasetAdd)

        # adding extra index column in the dataset
        dataset = PredictiveUtilities.addInternalId(dataset)
        predictiveData.update({
            PredictiveConstants.DATASET: dataset
        })

        etlStats = PredictiveUtilities.performETL(etlInfo=predictiveData)
        dataset = etlStats.get(PredictiveConstants.DATASET)
        originalDataset = etlStats.get(PredictiveConstants.ORIGINALDATASET)

        algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
        modelStorageLocation = predictiveData.get(PredictiveConstants.MODELSTORAGELOCATION)
        modelName = predictiveData.get(PredictiveConstants.MODELSHEETNAME)
        datasetName = predictiveData.get(PredictiveConstants.DATASETNAME)
        spark = predictiveData.get(PredictiveConstants.SPARK)
        locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)

        if PredictiveConstants.LINEAR_REG.__eq__(algoName) or \
                PredictiveConstants.RIDGE_REG.__eq__(algoName) or PredictiveConstants.LASSO_REG.__eq__(algoName):
            regressionPrediction = LinearRegressionModel.load(modelStorageLocation)
        if PredictiveConstants.RANDOMFORESTALGO.__eq__(algoName):
            regressionPrediction = RandomForestRegressionModel.load(modelStorageLocation)
        if PredictiveConstants.GRADIENTBOOSTALGO.__eq__(algoName):
            regressionPrediction = GBTRegressionModel.load(modelStorageLocation)

        dataset = dataset.drop(modelName)
        originalDataset = originalDataset.drop(modelName)
        dataset = regressionPrediction.transform(dataset)
        dataset = dataset.select(PredictiveConstants.DMXINDEX, modelName)
        finalDataset = originalDataset.join(dataset, on=[PredictiveConstants.DMXINDEX]) \
            .sort(PredictiveConstants.DMXINDEX).drop(PredictiveConstants.DMXINDEX)

        # predictionData = predictionData.drop(featuresColm)
        #
        # #dropping extra added column
        # if indexedFeatures:
        #     indexedFeatures.extend(oneHotEncodedFeaturesList)
        #     predictionData = predictionData.drop(*indexedFeatures)
        # else:
        #     predictionData = predictionData

        # overWriting the original dataset
        '''This write step is needed because Spark does not read or write the whole
        dataset at once: it only pulls limited data into memory and evaluates lazily,
        so overwriting a dataset that is still being read is not possible.'''
        emptyUserId = ''
        randomUUID = str(uuid.uuid4())
        fileNameWithPathTemp = locationAddress + randomUUID + datasetName + "_temp.parquet" #correct the name.
        finalDataset.write.parquet(fileNameWithPathTemp, mode="overwrite")  # send this path to java for deletion
        predictionDataReadAgain = spark.read.parquet(fileNameWithPathTemp)

        predictionTableData = \
            PredictiveUtilities.writeToParquet(fileName=datasetName,
                                               locationAddress=locationAddress,
                                               userId=emptyUserId,
                                               data=predictionDataReadAgain)
        return predictionTableData
Example #16
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_RandomForestReg'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark session
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("RandomForestReg_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing only
        #dataset = dataset[0:1000]

        Y_datavec = dataset[Y_names].values
        # Get the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle data imbalance (optional)
        #X,Y =  mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y =  mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Store the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate features and target
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and label
        formula = RFormula(formula='Y ~ .',
                           featuresCol="features",
                           labelCol="label")
        raw_df = formula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Train the model
        clf_model = dmp.Distr_RandomForestRegressor(xy_train, xy_test)
        # Save the fitted model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Get the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = RandomForestRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)