def save_load_model():
    """Interpret model results, save and load model"""
    # (1) Interpreting results
    #
    # It is almost always important to know which features are influencing
    # your prediction the most. Perhaps its counterintuitive and that's an
    # insight? Perhaps a hand full of features account for most of the
    # accuracy of your model and you don't need to perform time acquiring or
    # massaging other features. In this example we will be looking at a model
    # that has been trained without any LISTPRICE information. With that
    # gone, what influences the price the most?

    # Convert feature importances to a pandas column
    # fixed: this line was commented out, leaving `importances` undefined below
    importances = model.featureImportances.toArray()
    fi_df = pd.DataFrame(importances, columns=['importance'])
    # Convert list of feature names to pandas column
    fi_df['feature'] = pd.Series(feature_cols)
    # Sort the data based on feature importance
    fi_df.sort_values(by=['importance'], ascending=False, inplace=True)
    # Inspect Results
    fi_df.head(10)
    #    importance  feature
    # 36   0.256598  SQFT_TOTAL
    # 4    0.212320  TAXES
    # 6    0.166661  LIVINGAREA
    # ...

    # (2) Saving and loading models
    #
    # Often times you may find yourself going back to a previous model to see
    # what assumptions or settings were used when diagnosing where your
    # prediction errors were coming from. Perhaps there was something wrong
    # with the data? Maybe you need to incorporate a new feature to capture
    # an unusual event that occurred?
    from pyspark.ml.regression import RandomForestRegressionModel

    # Save model
    model.save('rfr_no_listprice')

    # Load model
    loaded_model = RandomForestRegressionModel.load('rfr_no_listprice')
    # NOTE(review): `model` and `feature_cols` are free names here —
    # presumably module-level globals from a training step; confirm.
def getOrCreateRFR(self):
    """Return the cached RandomForestRegressionModel.

    Loads it from CONST_RFR_FILE on first use; if loading fails for any
    reason, falls back to training a fresh model via self.createRFR().
    """
    try:
        if self.rfrModel is None:  # fixed: identity check instead of `== None`
            self.rfrModel = RandomForestRegressionModel.load(CONST_RFR_FILE)
    except Exception:  # fixed: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        print("Creating RFR Model")
        self.rfrModel = self.createRFR()
    return self.rfrModel
def load(self, load_dir):
    """Restore the fitted model, imputer and assembler from `load_dir`.

    The PM10 pollutant uses a LinearRegressionModel; every other pollutant
    uses a RandomForestRegressionModel.

    Raises:
        RuntimeError: if `load_dir` does not exist or is not a directory.
    """
    if not os.path.isdir(load_dir):
        raise RuntimeError(
            'Save path: {}, does not exist or is not a directory'.format(
                load_dir))
    model_cls = (LinearRegressionModel if self.pm == 'PM10'
                 else RandomForestRegressionModel)
    self.model = model_cls.load(os.path.join(load_dir, 'model'))
    self.imputer = ImputerModel.load(os.path.join(load_dir, 'imputer'))
    self.assembler = VectorAssembler.load(os.path.join(load_dir, 'assembler'))
def __getBasePredictors(self, num=5):
    """Return `num` base predictors.

    When saved copies exist under self._predictorPath they are loaded from
    disk; otherwise each predictor is trained and persisted (overwriting any
    partial previous save).
    """
    if not os.path.exists(self._predictorPath):
        os.mkdir(self._predictorPath)
    predictors = []
    if os.listdir(self._predictorPath):
        # Saved predictors present: load each one from its numbered path.
        for idx in range(num):
            predictors.append(
                RandomForestRegressionModel.load(
                    self._predictorModelPath.format(idx)))
    else:
        # Nothing saved yet: train each predictor and persist it.
        for idx in range(num):
            predictor = self.__getBasePredictor(idx)
            predictors.append(predictor)
            predictor.write().overwrite().save(
                self._predictorModelPath.format(idx))
    return predictors
def rfRegressor(df):
    """Train a RandomForestRegressor on `df` (label column: 'price').

    Shows the absolute relative prediction error per row, persists the model
    and re-loads it as a sanity check.

    Returns:
        (importance_map_df, model): feature-importance mapping and the
        fitted RandomForestRegressionModel.
    """
    # Move 'price' to the last column so the slice below can treat
    # x[0:-1] as features and x[-1] as the label.
    df = df.withColumn('tmp_price', df['price'])
    df = df.drop('price')
    df = df.withColumnRenamed('tmp_price', 'price')
    feature_label = df.rdd.map(
        lambda x: (Vectors.dense([float(i) for i in x[0:-1]]),
                   float(x[-1]))).toDF(["features", "label"])
    (trainingData, testData) = feature_label.randomSplit([0.7, 0.3])
    rf = RandomForestRegressor()
    model = rf.fit(trainingData)
    importance_map_df = importance_features_map(df, model, 'price')
    # Make predictions.
    predictions = model.transform(testData)
    predict_df = predictions.select("prediction", "label")
    predict_df = predict_df.withColumn(
        'rate',
        (predict_df['prediction'] - predict_df['label']) / predict_df['label'])

    def _abs_rate(s):
        # Absolute relative error rounded to 3 decimals.
        return round(abs(s), 3)

    # fixed: the UDF wrapper used to rebind the same name as the plain
    # function (`udf_rate = udf(udf_rate)`), shadowing it.
    abs_rate_udf = udf(_abs_rate)
    predict_df = predict_df.select(
        '*', abs_rate_udf(predict_df['rate']).alias('rates')).drop('rate')
    predict_df.show()
    # fixed: use overwrite() so a re-run doesn't fail because the path exists
    model.write().overwrite().save("/root/myModelPath1")
    sameModel = RandomForestRegressionModel.load("/root/myModelPath1")
    same_predict_df = sameModel.transform(testData)
    print('=======================================')
    same_predict_df.show()
    return importance_map_df, model
def loadModel(self):
    """Load the persisted regression model for self.algoName, predict on
    self.dataset and persist the prediction table as parquet.

    Returns:
        The table summary produced by PredictiveUtilities.writeToParquet.

    Raises:
        ValueError: if self.algoName is not a supported algorithm.
    """
    if self.algoName in ("linear_reg", "ridge_reg", "lasso_reg"):
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    elif self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    elif self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)
    else:
        # fixed: an unrecognised algoName previously left
        # regressionPrediction unbound, raising NameError further down.
        raise ValueError("Unsupported algorithm name: {}".format(self.algoName))
    # Drop the already-existing prediction column from a previous run of the
    # same model before re-predicting.
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)
    # Drop the extra columns added by indexing/one-hot encoding.
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)
    '''this step is needed to write because of the nature of spark to not
    read or write whole data at once it only takes limited data to memory
    and another problem was lazy evaluation of spark. so overwriting the
    same dataset which is already in the memory is not possible'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                           locationAddress=self.locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def RandomForestRegressor():
    """Doctest-style walkthrough of the pyspark RandomForestRegressor API:
    fit a tiny model, inspect it, persist both estimator and model, and
    reload them. Expected values are shown in trailing comments.
    """
    # fixed: this function shadows the pyspark estimator of the same name at
    # module scope, so calling the bare name inside would re-enter this
    # function; re-import the estimator under an alias instead.
    from pyspark.ml.regression import RandomForestRegressor as RFRegressor
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    rf = RFRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])
    # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    model.numFeatures
    # 1
    model.trees
    # [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]
    model.getNumTrees
    # 2
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 0.5
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RFRegressor.load(rfr_path)
    rf2.getNumTrees()
    # 2
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances
def predict_data(spark, logger, model_path, data):
    '''Preprocess `data` (without a pipeline), load the persisted random
    forest model from `model_path` and predict.

    Returns:
        List of float predictions, or None on any failure (the error is
        logged with its traceback).
    '''
    from pyspark.ml.regression import RandomForestRegressionModel
    from project.schema import get_pred_schema
    try:
        # fixed: `assert` is stripped under -O; validate explicitly (the
        # raised error is still caught below, so callers see None as before).
        if not data:
            raise ValueError('empty data')
        logger.info("{} rows".format(len(data)))
        # create spark dataframe
        spark_data = spark.createDataFrame(data, get_pred_schema())
        # preprocessing
        preprocess_data = preprocess(spark_data)
        # load and predict
        m = RandomForestRegressionModel.load(model_path)
        pred = m.transform(preprocess_data)
        return [p['prediction'] for p in pred.collect()]
    except Exception:
        # fixed: traceback.print_exc() returns None, so the traceback never
        # reached the log; format_exc() returns the text.
        logger.error(traceback.format_exc())
        return None
# Load the persisted model matching algoName; each branch imports only the
# pyspark class it needs.
if algoName == "LogisticRegression":
    from pyspark.ml.classification import LogisticRegressionModel
    model = LogisticRegressionModel.load(modelPath)
elif algoName == "LinearRegression":
    from pyspark.ml.regression import LinearRegressionModel
    model = LinearRegressionModel.load(modelPath)
elif algoName == "DecisionTreeClassification":
    from pyspark.ml.classification import DecisionTreeClassificationModel
    model = DecisionTreeClassificationModel.load(modelPath)
elif algoName == "DecisionTreeRegression":
    from pyspark.ml.regression import DecisionTreeRegressionModel
    model = DecisionTreeRegressionModel.load(modelPath)
elif algoName == "RandomForestClassification":
    from pyspark.ml.classification import RandomForestClassificationModel
    model = RandomForestClassificationModel.load(modelPath)
elif algoName == "RandomForestRegression":
    from pyspark.ml.regression import RandomForestRegressionModel
    model = RandomForestRegressionModel.load(modelPath)
elif algoName == "GBTClassification":
    from pyspark.ml.classification import GBTClassificationModel
    model = GBTClassificationModel.load(modelPath)
elif algoName == "GBTRegression":
    from pyspark.ml.regression import GBTRegressionModel
    model = GBTRegressionModel.load(modelPath)
else:
    # fixed: an unknown algoName previously left `model` unbound, producing
    # a confusing NameError at transform() time.
    raise ValueError("Unknown algoName: {}".format(algoName))

# predict
prediction = model.transform(data).select("prediction")
# save
prediction.write.format("csv").save(outputPath)
# Convert feature importances to a pandas column
fi_df = pd.DataFrame(importances, columns=['importance'])

# Convert list of feature names to pandas column
fi_df['feature'] = pd.Series(feature_cols)

# Sort the data based on feature importance
fi_df.sort_values(by=['importance'], ascending=False, inplace=True)

# Inspect Results
fi_df.head(10)

# Saving & Loading Models
# -----------------------
# (fixed: the text below was bare prose in the middle of the script, which
# is a syntax error; it is now a comment block.)
# Often times you may find yourself going back to a previous model to see
# what assumptions or settings were used when diagnosing where your
# prediction errors were coming from. Perhaps there was something wrong with
# the data? Maybe you need to incorporate a new feature to capture an
# unusual event that occurred? In this example, you will practice saving and
# loading a model.

from pyspark.ml.regression import RandomForestRegressionModel

# Save model
model.save('rfr_no_listprice')

# Load model
loaded_model = RandomForestRegressionModel.load('rfr_no_listprice')
def load_json_and_predict(spark, sqlContext, json_file):
    """Load delivery data from `json_file`, apply the same preprocessing as
    the historical/training data, score it with the persisted model named in
    MODEL_NAME_FILE and return a dataframe with `delivery_id` and
    `predicted_delivery_seconds`.
    """
    # Load data to predict
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")
    # Apply same process as historical data to convert/map.
    # Drop rows with NA columns, then keep only plausible rows.
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()
    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) &
        (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]
    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec,
        IntegerType())  # LongType() not available for now
    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))
    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)
    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)
    # Cast the numeric string columns to integers (was a long chain of
    # identical withColumn calls).
    for colname in ("market_id", "order_protocol", "total_onshift_runners",
                    "total_busy_runners", "total_outstanding_orders",
                    "estimated_store_to_consumer_driving_duration",
                    "subtotal", "num_distinct_items",
                    "estimated_order_place_duration", "total_items"):
        predict_df_1 = predict_df_1.withColumn(
            colname, predict_df_1[colname].cast(IntegerType()))
    print("Done")
    # Use same features as in historical data.
    # Other columns in test data ('store_id', 'store_primary_category',
    # 'min_item_price', 'max_item_price') will be dropped by the
    # VectorAssembler transformation.
    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")
    # fixed: file handle was closed manually (and leaked on error); use a
    # context manager.
    with open(MODEL_NAME_FILE, "r") as txt_file:
        model_name = txt_file.read()
    print("Read model: ", model_name)
    print("Loading model " + model_name + " from " + MODEL_DIR)
    if model_name == DT_MODEL:
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)
    elif model_name == GBT_MODEL:
        predict_model = GBTRegressionModel.load(MODEL_DIR)
    elif model_name == LR_MODEL:
        predict_model = LinearRegressionModel.load(MODEL_DIR)
    elif model_name == RF_MODEL:
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)
    else:
        # fixed: an unknown model name previously left predict_model unbound
        # and raised NameError at transform() time.
        raise ValueError("Unknown model name: {}".format(model_name))
    print("Done")
    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")
    # NOTE(review): pairing rows of two dataframes via
    # monotonically_increasing_id relies on identical partitioning of both
    # frames — confirm this holds for this pipeline.
    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction', 'predicted_delivery_seconds').withColumn(
            "id", monotonically_increasing_id())
    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))
    return prediction_results_df
# Convert the raw timestamp columns into second-deltas relative to DateTime:
# past events (PDT/SPDT/TPDT) become DateTime - event, future events
# (NDT/SNDT/TNDT) become event - DateTime.
df_all_dates = df_all_dates.withColumn("PDT", col("DateTime").cast(LongType()) - col("PDT").cast(LongType())) \
    .withColumn("SPDT", col("DateTime").cast(LongType()) - col("SPDT").cast(LongType())) \
    .withColumn("NDT", col("NDT").cast(LongType()) - col("DateTime").cast(LongType())) \
    .withColumn("SNDT", col("SNDT").cast(LongType()) - col("DateTime").cast(LongType())) \
    .withColumn("TPDT", col("DateTime").cast(LongType()) - col("TPDT").cast(LongType())) \
    .withColumn("TNDT", col("TNDT").cast(LongType()) - col("DateTime").cast(LongType()))
# Write the prepared frame to HDFS as a single pipe-separated CSV.
res_file = "/regression/all_dates.csv"
df_all_dates.repartition(1).write.mode("overwrite").option("header", "true").option("sep", "|").csv(
    HDFS_NAMENODE + res_file)
# ---------------------------------------------------------------------------------------------------------------------
# Read the CSV back (all columns come back as strings) and load the model.
res_file = "/regression/all_dates.csv"
df_all_dates = spark.read.option("sep", "|").option("header", "true").csv(
    HDFS_NAMENODE + res_file)
model = RandomForestRegressionModel.load(HDFS_NAMENODE + "/models/rf_model.model")
print("model loaded")
# UDF wrapping `interpol` (defined elsewhere) returning a float.
interpol_udf = udf(interpol, FloatType())
# NOTE(review): this chunk ends mid-statement — the required_features list
# continues beyond the visible source.
required_features = [
    'Hour', 'PGV', 'PDT', 'NDT', 'NGV', 'SPDT', 'SPGV', 'SNDT', 'SNGV', 'TPDT', 'TPGV', 'TNDT',
# Load the fitted PCA model for the scaled feature pipeline.
# NOTE(review): NoScale_Pca, vector_vehicle_df and scaledData are defined
# before this chunk — confirm their construction upstream.
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')
# Project both the unscaled and scaled feature vectors onto their PCA bases.
NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])
#Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')
#Generate prediction — each model scores the frame and collect()[0] pulls
# the first prediction back to the driver.
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
#Prepare output df to output predictions
output_df = pd.DataFrame()
# NOTE(review): this chunk ends mid-statement — the list below continues
# beyond the visible source.
output_df['Algorithm'] = [
    'Linear Regression', 'Decision Tree', 'Gradient Boosted Tree', 'Random Forest'
def basicPredictionPipeline(data, col_target="", first_pred_day=False,
                            dt_execution=False, jarra='quinto', logger=False,
                            verbose=True, checks=False):
    """Run the walk-forward prediction pipeline.

    Loads the fitted data-prep and RF model pipelines from S3 for the
    execution date, scores the consecutive test windows, saves results and
    feature importances locally, and returns (mlflow_params, pred_errors).
    """
    try:
        start_all = datetime.now()
        # Get parameters from config file
        number_of_models = 6
        parser = SafeConfigParser()
        parser.read(MODEL_CONFIG_FILE)
        local_save_path = parser.get('save_params', 'local_save_path')
        if not os.path.exists(local_save_path):
            os.makedirs(local_save_path)
        # NOTE(review): duplicate read of the same config key.
        local_save_path = parser.get('save_params', 'local_save_path')
        # Define name of the variable for predictions
        cols_cyclical, cols_ohe_in, cols_features, col_target, cols_id = defineFeatures(
            model_complex='first',
            use_clustered_data_sets=False,
            col_target=col_target,
            verbose=False,
            logger=False)
        cols_ohe_out = [s + '_catVec' for s in cols_ohe_in]
        # NOTE(review): the default first_pred_day=False passes the
        # `is not None` test, so split_value would be False and the
        # .strftime call below would fail; callers presumably always pass a
        # datetime or None — confirm.
        if first_pred_day is not None:
            split_value = first_pred_day
        else:
            split_value = datetime.today()
        first_pred_day = split_value.strftime('%Y-%m-%d')
        split_value = split_value.strftime('%Y-%m-%d')
        if not dt_execution:
            dt_execution = split_value
        s3_save_path = parser.get('save_params', 's3_save_path')
        s3_save_pipelines_path = s3_save_path + 'pipelines/' + col_target + \
            '/dt-execution=' + dt_execution + '/'
        # Connect to spark session
        spark = createSparkSession(jarra='mass', verbose=True, logger=logger)
        # Load data prep and model pipelines from S3 for model training run on dt_execution:
        if verbose:
            logger.info(
                'Loading data preparation and model pipelines lists from ' +
                s3_save_pipelines_path)
        pipelinePrepList = []
        fitList = []
        for i in range(number_of_models):
            pipelinePrepList.append(
                PipelineModel.read().load(s3_save_pipelines_path +
                                          "data_prep_pipeline" + str(i)))
            fitList.append(RandomForestRegressionModel.read().load(
                s3_save_pipelines_path + "model_pipeline" + str(i)))
        if verbose:
            logger.info(
                'Loading data preparation and model pipelines lists end')
        # Add cyclical variables to features lists, OHE_out not as they are already in pipelines
        cols_cyclical_sin = [s + '_sin' for s in cols_cyclical]
        cols_cyclical_cos = [s + '_cos' for s in cols_cyclical]
        cols_cyclical_out = cols_cyclical_sin + cols_cyclical_cos
        for i in range(len(cols_features)):
            cols_features[i] = cols_features[i] + cols_cyclical_out
        # Create list with start and end dates for each of consecutive models
        start_days_list, end_days_list = createTestDatesListWFV(
            split_value, verbose=verbose, logger=logger)
        # Define date filters for test/pred sets of each consecutive models
        filterPredStartList = []
        filterPredEndList = []
        for i in range(len(start_days_list)):
            filterPredStartList.append(
                col('dt_flight_date_local') >= start_days_list[i])
            filterPredEndList.append(
                col('dt_flight_date_local') <= end_days_list[i])
        # Create list with test data sets for each of the consecutive models, each data set have different features
        # and dates, also data list for rows/flights with Nulls (e.g. no historical data) is created separately
        test_data_list, test_data_basic_list = createTestDataLists(
            data, cols_features, cols_ohe_in, col_target, cols_id,
            filterPredStartList, filterPredEndList, spark, verbose, logger)
        # Transform string idexer, ohe, vector assembler using pipeline from training
        if verbose:
            logger.info(
                'String indexer, one hot encoder and vector assembler test sets, start'
            )
        testDataList = []
        testDataBasicList = []
        for i in range(len(test_data_list)):
            if verbose:
                logger.info('Model ' + str(i))
            testDataList.append(pipelinePrepList[i].transform(
                test_data_list[i]))
        if verbose:
            logger.info('RF Model start')
        # Apply RF model data using pipeline from training
        resultsList = []
        resultsBasicList = []
        for i in range(len(testDataList)):
            # Use the test set, is creating an extra column 'col_target' with the test fit results
            resultsList.append(fitList[i].transform(
                testDataList[i]).select(cols_id + [col_target + '_pred']))
        if verbose:
            logger.info('RF Model end')
        # Union dataframes with results for each model as one dataframe (to get the full results)
        # NOTE(review): resultsBasicList / testDataBasicList are never
        # populated in this function, so resultsBasicList[0] below would
        # raise IndexError — the code filling them may have been removed;
        # confirm.
        resultsFull = resultsList[0]
        resultsFull = resultsFull.union(resultsBasicList[0])
        for i in range(1, len(test_data_list)):
            resultsFull = resultsFull.union(resultsList[i])
            resultsFull = resultsFull.union(resultsBasicList[i])
        resultsFull.cache()
        resultsFull = resultsFull.withColumn('dt_flight_date_local',
                                             to_date('dt_flight_date_local'))
        # Add execution date column
        resultsFull = resultsFull.withColumn('dt_execution',
                                             lit(first_pred_day))
        resultsFull = resultsFull.withColumn('dt_execution',
                                             to_date('dt_execution'))
        # Save prediction results in local for each model seperately
        if verbose:
            logger.info('Changing data frame to Pandas to save in local')
        model_results = resultsFull.toPandas()
        if not os.path.isdir(local_save_path):
            os.mkdir(local_save_path)
        model_results\
            .to_csv(local_save_path + col_target + '_results_' +
                    first_pred_day.replace('-', '_') + '.csv', index=False)
        if verbose:
            logger.info('Results saved in: ' + local_save_path + col_target +
                        '_results_' + first_pred_day.replace('-', '_') +
                        '.csv')
        # Get feature importances
        featureImportancesFirst, featureImportancesLast, feature_importances_all = calcFeatImportance(
            fitList, testDataList, col_target, first_pred_day, verbose,
            logger)
        # Save feature importance for given target variable
        feature_importances_all.\
            to_csv(local_save_path + col_target + '_feat_importance_' +
                   first_pred_day.replace('-', '_') + '.csv', index=False)
        end_all = datetime.now()
        if verbose:
            logger.info('Random Forest, all models, time: ' +
                        str(end_all - start_all))
            logger.info('Feature importance saved in: ' + local_save_path +
                        col_target + '_feat_importance_' +
                        first_pred_day.replace('-', '_') + '.csv')
            logger.info(
                'Check sum of predicted variables per month and count of flights each month: '
            )
        # Calculate metrics for mlflow
        if verbose and checks:
            df_prediction_errors, pred_errors = calcTrainingSetError(
                number_of_last_days_to_eval=90,
                last_dt_exec_to_evaluate=False,
                list_exec_dates_to_evalute=False,
                remove_outliers=True,
                verbose=True,
                logger=logger,
                checks=True)
            checkDuplicates = resultsFull.drop_duplicates(
                subset=['dt_flight_date_local', 'cd_num_flight',
                        'cd_airport_pair', 'cd_carrier'])\
                .count() - resultsFull.count()
            resultsFullCount = resultsFull.count()
            # Count sum of rows in all test sets
            testSetCount = np.sum(
                [testDataList[i].count() for i in range(len(testDataList))])
            testBasicSetCount = np.sum([
                testDataBasicList[i].count()
                for i in range(len(testDataBasicList))
            ])
            logger.info('Sum of flights per month (real values): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    count("cd_airport_pair")).sort(
                        "dt_flight_year_month").toPandas())
            # NOTE(review): col_predict is not defined in this function —
            # presumably a module-level constant (likely
            # col_target + '_pred'); confirm, otherwise this branch raises
            # NameError.
            logger.info('Sum of predicted ' + col_predict +
                        ' per month (all flights): ')
            logger.info(
                resultsFull.groupBy("dt_flight_year_month").agg(
                    sum(col_predict)).sort("dt_flight_year_month").toPandas())
            logger.info('Number of duplicated flights: ')
            logger.info('Number of rows/flights in test sets: ' +
                        str(testSetCount))
            logger.info('Number of rows/flights in basic model test sets: ' +
                        str(testBasicSetCount))
            logger.info('Number of flights/rows in prediction set:')
            logger.info(resultsFullCount)
            logger.info(
                'Feature importances for the first model (flights this week):')
            logger.info(featureImportancesFirst)
            logger.info('Feature importances for the last model:')
            logger.info(featureImportancesLast)
            mlflow_params = {
                'checkDuplicates': checkDuplicates,
                'resultsFullCount': resultsFullCount,
                'testSetCount': testSetCount,
                'testBasicSetCount': testBasicSetCount,
                'predDateMin':
                str(resultsFull.toPandas().dt_flight_date_local.min()),
                'predDateMax':
                str(resultsFull.toPandas().dt_flight_date_local.max()),
                'time_seconds': (end_all - start_all).total_seconds()
            }
        else:
            mlflow_params = {}
        #spark.stop()
        #if verbose:
        #    logger.info('Spark Session stopped')
    except Exception:
        logger.exception("Fatal error in demand_forecast_pred()")
        raise
    # NOTE(review): pred_errors is only bound when `verbose and checks` is
    # true — this return raises NameError otherwise; confirm intent.
    return (mlflow_params, pred_errors)
def prediction(self, predictiveData):
    """Run a persisted regression model over the dataset described by
    `predictiveData` and persist the joined predictions as parquet.

    Returns:
        The table summary produced by PredictiveUtilities.writeToParquet.

    Raises:
        ValueError: if the algorithm name is not supported.
    """
    datasetAdd = predictiveData.get(PredictiveConstants.DATASETADD)
    spark = predictiveData.get(PredictiveConstants.SPARK)
    # Re-read the dataset so datatype changes don't leak into the original.
    dataset = spark.read.parquet(datasetAdd)
    # Add an internal index column, used later to join predictions back.
    dataset = PredictiveUtilities.addInternalId(dataset)
    predictiveData.update({
        PredictiveConstants.DATASET: dataset
    })
    etlStats = PredictiveUtilities.performETL(etlInfo=predictiveData)
    dataset = etlStats.get(PredictiveConstants.DATASET)
    originalDataset = etlStats.get(PredictiveConstants.ORIGINALDATASET)
    algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
    modelStorageLocation = predictiveData.get(PredictiveConstants.MODELSTORAGELOCATION)
    modelName = predictiveData.get(PredictiveConstants.MODELSHEETNAME)
    datasetName = predictiveData.get(PredictiveConstants.DATASETNAME)
    locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)
    # fixed: the original chained `X.__eq__(algoName)` if-statements; besides
    # being unidiomatic, an unrecognised algoName left regressionPrediction
    # unbound and raised NameError at transform() time.
    if algoName in (PredictiveConstants.LINEAR_REG,
                    PredictiveConstants.RIDGE_REG,
                    PredictiveConstants.LASSO_REG):
        regressionPrediction = LinearRegressionModel.load(modelStorageLocation)
    elif algoName == PredictiveConstants.RANDOMFORESTALGO:
        regressionPrediction = RandomForestRegressionModel.load(modelStorageLocation)
    elif algoName == PredictiveConstants.GRADIENTBOOSTALGO:
        regressionPrediction = GBTRegressionModel.load(modelStorageLocation)
    else:
        raise ValueError("Unsupported algorithm name: {}".format(algoName))
    # Drop any stale prediction column before re-predicting.
    dataset = dataset.drop(modelName)
    originalDataset = originalDataset.drop(modelName)
    dataset = regressionPrediction.transform(dataset)
    dataset = dataset.select(PredictiveConstants.DMXINDEX, modelName)
    finalDataset = originalDataset.join(dataset, on=[PredictiveConstants.DMXINDEX]) \
        .sort(PredictiveConstants.DMXINDEX).drop(PredictiveConstants.DMXINDEX)
    '''this step is needed to write because of the nature of spark to not
    read or write whole data at once it only takes limited data to memory
    and another problem was lazy evaluation of spark. so overwriting the
    same dataset which is already in the memory is not possible'''
    emptyUserId = ''
    randomUUID = str(uuid.uuid4())
    fileNameWithPathTemp = locationAddress + randomUUID + datasetName + "_temp.parquet"
    finalDataset.write.parquet(fileNameWithPathTemp, mode="overwrite")
    # NOTE(review): the temp parquet is never deleted here — the original
    # comment says the path is sent to Java for deletion; confirm.
    predictionDataReadAgain = spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=datasetName,
                                           locationAddress=locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*',
                            action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_RandomForestReg'
    dir_of_dict = sys.argv[1]
    # Read job configuration (paths, field lists, split sizes, etc.)
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag
    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    # Configure the Spark client (local mode, 4 cores)
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("RandomForestReg_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")
    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For quick testing only
        #dataset = dataset[0:1000]
        Y_datavec = dataset[Y_names].values
        # Split string and numeric fields separately, then merge them back
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle class-imbalance (currently disabled)
        #X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        #X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction (keep components explaining 99%)
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print 'PCA Information:', pca_num, ret
            print '----------------------------------------------'
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num for later prediction runs
        too.StorePara(dir_of_storePara, vocabset, ret_num)
        print '--------------Train data shape----------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print 'Y.shape:', Y.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name
        features = pd.DataFrame(X, )
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the feature and target matrices
        merged = pd.concat([features, targets], axis=1)
        # Create spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target via RFormula
        fomula = RFormula(formula='Y ~ .',
                          featuresCol="features",
                          labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into train and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Train the model
        clf_model = dmp.Distr_RandomForestRegressor(xy_train, xy_test)
        # Persist the fitted model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print '----------------------------------------------'
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration
    if options == 'predict':
        time_start = time()
        # Reload the parameters saved by the training run
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Split string/numeric fields separately, then merge (reusing vocabset)
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction with the saved component count
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)
        print '-------------Pdedict data shape---------------'
        print 'X.shape:', X.shape
        print '----------------------------------------------'
        print '--------------Start %s model------------------' % model_name
        features = pd.DataFrame(X, )
        # Create spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        # Reload the persisted model and predict
        clf_model = RandomForestRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print 'Total run time: %s' % duration