def get_dataframe_loadStrategy(df_dict):
    dataframe_loadStrategy = df_dict['targetDataframeDetails']['DFLoadStrategy'].upper()
    miscProcess.log_info(SCRIPT_NAME,
                         "Dataframe load strategy: {}".format(dataframe_loadStrategy))
    return dataframe_loadStrategy
def sourceOccupancyReadParquet(occupancyFilePath, custom_schema, partition_value):
    miscProcess.log_info(SCRIPT_NAME, "Reading Occupancy CSV file...")
    print("Reading Occupancy CSV file")

    source_data_info = {}
    source_data_info["type"] = "CSV"
    # filepath = source_config['sources']['driverSource']["filePath"]
    print("Occupancy file path : {}".format(occupancyFilePath))

    try:
        occupancy = spark.read.format("csv") \
            .option("header", True) \
            .schema(custom_schema) \
            .load(occupancyFilePath)
    except Exception as e:
        miscProcess.log_info(SCRIPT_NAME, "error in reading csv: {}".format(e))
        raise  # re-raise so the code below does not reference an undefined dataframe

    source_data_info["occupancyFilePath"] = occupancyFilePath
    source_data_info["partition"] = str(partition_value)

    occupancy.show(3)
    return (occupancy, source_data_info)
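# Illustrative only: a minimal sketch (not called anywhere) of how sourceOccupancyReadParquet
# might be invoked. The column names, types, sample path, and partition value below are
# assumptions for illustration; the real schema is expected to be built from the dataframe
# JSON config rather than hard-coded like this.
def _example_read_occupancy():
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType
    occupancy_schema = StructType([
        StructField("occupancydatetime", StringType(), True),
        StructField("station_id", StringType(), True),
        StructField("occupied_spots", IntegerType(), True),
        StructField("available_spots", IntegerType(), True),
        StructField("location", StringType(), True),
    ])
    # hypothetical path and partition value
    return sourceOccupancyReadParquet("/mnt/sample/occupancy.csv", occupancy_schema, "2021")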
def partition_column(df_dict):
    part_col_lcase = df_dict['targetDataframeDetails']['dataframePartition']
    miscProcess.log_info(SCRIPT_NAME,
                         "Partition Column : {}".format(part_col_lcase))
    return part_col_lcase
def executeHistoricOccupancyOperations(src_df, output, cols_list, partn_col,
                                       max_retry_count, retry_delay, custom_schema):
    PartitionColumn = partn_col

    # Enrich the occupancy records with station coordinates
    station_id_lookup = createStationIDDF(custom_schema)
    occ_df = src_df \
        .join(station_id_lookup, ['station_id'], how='left_outer') \
        .select(src_df.OccupancyDateTime, src_df.Station_Id,
                src_df.Occupied_Spots, src_df.Available_Spots,
                station_id_lookup.Longitude, station_id_lookup.Latitude)

    # Parse the timestamp and derive the month name used as the partition column
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
    occ_df = occ_df.withColumn(
        'occupancydatetime',
        timestamp_format(F.col('occupancydatetime'), "MM/dd/yyyy hh:mm:ss a"))
    occ_df = occ_df.withColumn(PartitionColumn,
                               date_format(F.col('occupancydatetime'), "MMMM"))

    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    # Write the partitioned parquet output, retrying on failure
    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            occ_df.write.mode("append").partitionBy(PartitionColumn).parquet(output)
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output after {} tries: {} ".format(
                        max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output, re-try in {} seconds ".format(retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print("Number of Records Processed: {}".format(rec_cnt))
    return ReturnCode, rec_cnt
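# The transformations in this module call a few helper functions (timestamp_format,
# remove_non_word_characters, remove__parenthesis, format_minstoHHMMSS) that are defined
# elsewhere in the project. The _sketch_* functions below are only an assumption of what
# those helpers likely do; they are not used anywhere and the real definitions may differ.
from pyspark.sql import functions as F  # already imported in this module

def _sketch_timestamp_format(col, fmt):
    # parse a string column into a timestamp using the given pattern
    return F.to_timestamp(col, fmt)

def _sketch_remove_non_word_characters(col):
    # strip characters that are neither word characters nor whitespace (used on station_id)
    return F.regexp_replace(col, r"[^\w\s]", "")

def _sketch_remove_parenthesis(col):
    # drop '(' and ')' left over from splitting the location point string
    return F.regexp_replace(col, r"[()]", "")

def _sketch_format_minstoHHMMSS(colname):
    # convert a minutes-past-midnight column to an "HH:mm:ss" string
    mins = F.col(colname).cast("int")
    return F.format_string("%02d:%02d:00", (mins / 60).cast("int"), mins % 60)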
def build_dataframe_column_list(df_dict):
    column_list = []
    column_count = len(df_dict['targetDataframeDetails']['dataframeColumnInfo'])
    for i in range(0, column_count):
        column_list.append(
            df_dict['targetDataframeDetails']['dataframeColumnInfo'][i]['columnName'].lower())
    miscProcess.log_info(SCRIPT_NAME,
                         "Dataframe Column List: {}".format(column_list))
    return column_list
def parse_config(caller_function, filename, option_char='='):
    ReturnCode = 0
    OPTION_CHAR = option_char
    options = {}
    param_list = caller_function + "\n"

    f = open(filename)
    for line in f:
        # Ignore empty lines
        if not line.strip():
            continue
        # First, remove comments
        if COMMENT_CHAR in line:
            # If the first character on the line is '#', skip the whole line
            strip_line = line.strip()
            if strip_line[0] == '#':
                continue
            # Split on the comment char and keep the part before it
            line, comment = line.split(COMMENT_CHAR, 1)
            line += '\n'
        # Second, find lines with an option = value
        if OPTION_CHAR in line:
            param_list += '{}'.format(line)
            # Split on the option char
            option, value = line.split(OPTION_CHAR, 1)
            # Strip surrounding spaces and embedded whitespace
            option = option.strip()
            value = value.strip()
            value = remove_whitespace(value)
            options[option] = value
        else:
            miscProcess.log_error(
                SCRIPT_NAME,
                "ERROR: WRONG PARAMETER ASSIGNMENT ON LINE: {}".format(line.strip()),
                1)
            ReturnCode = 1
            break
    f.close()

    miscProcess.log_info(SCRIPT_NAME, param_list)
    return options, ReturnCode
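# Illustrative only: given a control file of "option = value" lines with '#' comments
# (COMMENT_CHAR is assumed to be '#'), parse_config returns a dict of options plus a return
# code. The file name, option names, and values below are hypothetical.
#
#   # sample job.cfg
#   ControlPath = /dbfs/control/
#   ConfigPath  = /dbfs/config/
#
# options, rc = parse_config("read_job_control", "/dbfs/control/job.cfg")
# if rc == 0:
#     control_path = options["ControlPath"]   # "/dbfs/control/"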
def sourceBlockfaceReadParquet(blockfacefilePath, cust_schema):
    miscProcess.log_info(SCRIPT_NAME, "Reading CSV file...")
    print("Reading CSV file")

    source_data_info = {}
    source_data_info["type"] = "CSV"

    try:
        blockface = spark.read.format("csv") \
            .option("header", True) \
            .schema(cust_schema) \
            .load(blockfacefilePath)
    except Exception as e:
        miscProcess.log_info(SCRIPT_NAME, "error in reading csv: {}".format(e))
        raise  # re-raise so the caller does not receive an undefined dataframe

    source_data_info["blockfacefilePath"] = blockfacefilePath
    return (blockface, source_data_info)
def executeOccupancyOperations(src_df, output, datedimoutputpath, cols_list,
                               partn_col, max_retry_count, retry_delay):
    PartitionColumn = partn_col
    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    # Read the source dataframe (from DBFS), retrying on failure
    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            input_df = src_df
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file after {} tries".format(max_retry_count))
                ReturnCode = 1
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file, re-try in {} seconds ".format(retry_delay))
                time.sleep(retry_delay)

    # Keep only the configured columns
    select_df = input_df.select(
        [colname for colname in input_df.columns if colname in cols_list])

    print("Reading inside transformation function")
    select_df.show(5)

    for column in cols_list:
        if column == 'station_id':
            print("Reading inside column transformations of {}".format(column))
            # Strip non-word characters and cast the station id to an integer
            select_df = select_df.withColumn(
                column, remove_non_word_characters(F.col("station_id")))
            select_df = select_df.withColumn(
                column, select_df[column].cast(IntegerType()))

        elif column == 'occupancydatetime':
            # Parse the timestamp and derive the month name used as the partition column
            spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
            select_df = select_df.withColumn(
                column, timestamp_format(F.col(column), "MM/dd/yyyy hh:mm:ss a"))
            select_df = select_df.withColumn(
                PartitionColumn, date_format(F.col(column), "MMMM"))

            # Build the date dimension alongside the occupancy dataframe
            date_dim = select_df.withColumn('day_of_week', date_format(F.col(column), "EEEE")) \
                                .withColumn('month', date_format(F.col(column), "MMMM"))
            date_dim = date_dim.select('occupancydatetime', 'day_of_week', 'month')

        elif column == 'location':
            # Split the location point string into longitude and latitude columns
            split_col = ['longitude', 'latitude']
            select_df = select_df.withColumn(split_col[0], F.split(column, ' ').getItem(1)) \
                                 .withColumn(split_col[1], F.split(column, ' ').getItem(2))
            select_df = select_df.withColumn(split_col[0], remove__parenthesis(F.col(split_col[0]))) \
                                 .withColumn(split_col[1], remove__parenthesis(F.col(split_col[1])))
            select_df = select_df.withColumn(split_col[0], select_df[split_col[0]].cast(DoubleType())) \
                                 .withColumn(split_col[1], select_df[split_col[1]].cast(DoubleType()))
            select_df = select_df.drop(column)

    # select_df = select_df.select(cols_list)
    # select_df = select_df.select([colname for colname in input_df.columns if colname in (cols_list)])

    # Write the occupancy output and the date dimension, retrying on failure
    RetryCt = 0
    Success = False
    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            select_df.show(3)
            miscProcess.log_print(
                "Writing occupancy dataframe to output file: {}".format(output))
            select_df.write.mode("append").partitionBy(PartitionColumn).parquet(output)

            miscProcess.log_print(
                "Writing date dimension to output file: {}".format(datedimoutputpath))
            date_dim.show(3)
            date_dim.write.mode("append").partitionBy(PartitionColumn).parquet(datedimoutputpath)
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output after {} tries: {} ".format(
                        max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output, re-try in {} seconds ".format(retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print("Number of Records Processed: {}".format(rec_cnt))
    return ReturnCode, rec_cnt
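# Illustrative only: a sketch (not called anywhere) of how the occupancy read and
# transform/write helpers above might be chained. The paths, column list, partition column
# name, and retry settings are all placeholders, not the project's actual configuration.
def _example_run_occupancy(custom_schema):
    occupancy_df, _info = sourceOccupancyReadParquet(
        "/mnt/sample/occupancy.csv", custom_schema, "2021")
    cols = ['occupancydatetime', 'station_id', 'occupied_spots', 'available_spots', 'location']
    rc, rec_cnt = executeOccupancyOperations(
        occupancy_df,
        "/mnt/sample/output/occupancy",   # hypothetical occupancy output path
        "/mnt/sample/output/date_dim",    # hypothetical date dimension output path
        cols,
        'month',                          # assumed partition column
        max_retry_count=3,
        retry_delay=30)
    return rc, rec_cnt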
def executeBlockfaceOperations(src_df, output, cols_list, max_retry_count, retry_delay):
    miscProcess.log_print("Starting the Blockface Execute Operations")
    src_df.printSchema()

    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    # Read the source dataframe, retrying on failure
    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            input_df = src_df
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file after {} tries".format(max_retry_count))
                ReturnCode = 1
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file, re-try in {} seconds ".format(retry_delay))
                time.sleep(retry_delay)

    # Keep only the configured columns
    select_df = input_df.select(
        [colname for colname in input_df.columns if colname in cols_list])

    # Convert the minute-based schedule columns to HH:MM:SS
    select_df = select_df.withColumn('wkd_start1', format_minstoHHMMSS('wkd_start1')) \
                         .withColumn('wkd_end1', format_minstoHHMMSS('wkd_end1')) \
                         .withColumn('wkd_start2', format_minstoHHMMSS('wkd_start2')) \
                         .withColumn('wkd_end2', format_minstoHHMMSS('wkd_end2')) \
                         .withColumn('wkd_end3', format_minstoHHMMSS('wkd_end3')) \
                         .withColumn('sat_start1', format_minstoHHMMSS('sat_start1')) \
                         .withColumn('sat_end1', format_minstoHHMMSS('sat_end1')) \
                         .withColumn('sat_start2', format_minstoHHMMSS('sat_start2')) \
                         .withColumn('sat_end2', format_minstoHHMMSS('sat_end2')) \
                         .withColumn('sat_start3', format_minstoHHMMSS('sat_start3')) \
                         .withColumn('sat_end3', format_minstoHHMMSS('sat_end3'))

    # miscProcess.log_print("Writing to output file: {}".format(output))
    select_df = select_df.select(
        [colname for colname in input_df.columns if colname in cols_list])

    # Write the blockface output as a single parquet file, retrying on failure
    RetryCt = 0
    Success = False
    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            miscProcess.log_info(SCRIPT_NAME, "Writing to Parquet file")
            select_df.show(3)
            print("Output file {}".format(output))
            select_df.coalesce(1).write.mode("overwrite").parquet(output + "//Blockface.parquet")
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing File after {} tries: {} ".format(max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing File, re-try in {} seconds ".format(retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print("Number of Records Processed: {}".format(rec_cnt))
    return ReturnCode, rec_cnt
miscProcess.log_step(SCRIPT_NAME, "PERFORMING STEP {}:{} ".format(STEP, STEP_DESC))

if SparkSubmitClientMode == 'Y':
    # Spark submitted in client mode
    job_control_file = ControlPath + JOBNAME + ".cfg"
    blockface_config_filename = ConfigPath + BlockfaceDataframeName.lower() + '.json'
    occupancy_config_filename = ConfigPath + OccupancyDataframeName.lower() + '.json'
else:
    # Spark submitted in cluster mode
    job_control_file = './' + JOBNAME + ".cfg"
    blockface_config_filename = './common/' + BlockfaceDataframeName.lower() + '.json'
    occupancy_config_filename = './common/' + OccupancyDataframeName.lower() + '.json'

if os.path.isfile(job_control_file):
    miscProcess.log_info(SCRIPT_NAME,
                         "Job control filename: {} exists".format(job_control_file))
    paramFile, ReturnCode = readEnvironmentParameters.read_job_control(job_control_file)
    if ReturnCode != 0:
        miscProcess.log_error(SCRIPT_NAME,
                              "Error: Reading Job Control file {} ".format(job_control_file),
                              ReturnCode)
        exit(STEP)
    # Promote the job-control parameters to module-level variables
    globals().update(paramFile)
else:
    miscProcess.log_error(SCRIPT_NAME,
                          "Job control filename: {} doesn't exist ".format(job_control_file),
                          STEP)
    exit(STEP)

#==============================================================================================================#
(STEP, STEP_DESC) = (20, "Validate All Needed Parameters defined from the control files")
#==============================================================================================================#
# ALWAYS PERFORM THIS STEP
def get_source_dateDimOutputPath(df_dict):
    datedimOutputPath = df_dict['sources']['driverSource']["DimOutputPath"]
    miscProcess.log_info(SCRIPT_NAME,
                         "Date Dim OutputPathFilePath: {}".format(datedimOutputPath))
    return datedimOutputPath
def get_source_OutputPath(df_dict):
    outputFilePath = df_dict['sources']['driverSource']["OutputPath"]
    miscProcess.log_info(SCRIPT_NAME,
                         "OutputPathFilePath: {}".format(outputFilePath))
    return outputFilePath
def get_source_driverFilerPath(df_dict):
    driverFilePath = df_dict['sources']['driverSource']["filePath"]
    miscProcess.log_info(SCRIPT_NAME,
                         "driverFilePath: {}".format(driverFilePath))
    return driverFilePath
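# Illustrative only: a sketch (not called anywhere) showing how the dataframe JSON config
# might be loaded and passed to the getter functions above. The config path is a placeholder;
# the JSON keys follow the accesses made inside the getters.
def _example_load_occupancy_config():
    import json
    with open('./common/occupancy.json') as cfg:   # hypothetical config file
        df_dict = json.load(cfg)
    load_strategy = get_dataframe_loadStrategy(df_dict)
    cols = build_dataframe_column_list(df_dict)
    part_col = partition_column(df_dict)
    source_path = get_source_driverFilerPath(df_dict)
    output_path = get_source_OutputPath(df_dict)
    date_dim_path = get_source_dateDimOutputPath(df_dict)
    return load_strategy, cols, part_col, source_path, output_path, date_dim_path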