def analyze(spark: SparkSession, file='/ships_data/PortSubset-small.csv',
            output_file='ais_data.parquet'):
    if not output_file:
        # No path, use tmp
        output_file = tempfile.mkdtemp() + '/ais_data.parquet'

    prt_high("""
    Running csv ingestion
    ##################################
    Parameters
     - Input file: {}
     - Output file: {}
    ##################################
    """.format(file, output_file))

    ships_table = spark.read.format('com.databricks.spark.csv')\
        .options(header='true', inferschema='false', delimiter=';').load(file)

    ships_table = clean_data(ships_table)
    ships_table = cast_data(ships_table)
    ships_table = ships_table.na.drop()
    ships_table = ensure_columns_type(ships_table)

    # New features
    ships_table = ships_table.withColumn(
        'ais_navstatus',
        when(col("sog") < 1, 'HOT')
        .when(col("sog") <= 5, 'MAN')
        .otherwise('CRU'))

    ships_table.write.mode('overwrite').parquet(output_file)

    # If we have mlflow, log the result
    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifacts(output_file, "ais_data.parquet")

    return
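
# A minimal read-back sketch of the navstatus buckets created above
# (sog < 1 -> 'HOT', 1 <= sog <= 5 -> 'MAN', otherwise 'CRU'). The parquet
# path is the job's default output; the helper name is illustrative only.
def _navstatus_counts_example(spark):
    ships = spark.read.parquet('ais_data.parquet')
    # Count how many AIS records fall into each navigation-status bucket
    return ships.groupBy('ais_navstatus').count().collect()
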
def main(args):
    prt_info("Going to run Spark Job")

    # Change path to the current file's path
    abspath = os.path.abspath(__file__)
    dname = os.path.dirname(abspath)
    os.chdir(dname)

    # environment = {
    #     'PYSPARK_JOB_ARGS': ' '.join(args.job_args) if args.job_args else ''
    # }

    job_args = dict()
    if args.job_args:
        job_args_tuples = [arg_str.split('=') for arg_str in args.job_args]
        prt_info('job_args_tuples: %s' % job_args_tuples)
        job_args = {a[0]: a[1] for a in job_args_tuples}

    prt_info('\nRunning job %s...\nenvironment is %s\n'
             % (args.job_name, str(job_args)))

    # Start Spark
    spark = SparkSession.builder\
        .appName(args.job_name)\
        .config("spark.jars", args.extra_jars)
    if args.hdfs:
        spark = spark.config("spark.hadoop.fs.defaultFS", args.hdfs)
    spark = spark.getOrCreate()

    # Set timezone to UTC
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    prt_warn("-> For some reason we get the spark SQLContext deprecated"
             " warning, but we're already using SparkSession. Ignore.")
    prt_info("Spark context started. Going to import jobs.")
    prt_info("Setting log level to {}".format(args.log_level))
    spark.sparkContext.setLogLevel(args.log_level)

    # Import Module
    try:
        job_module = importlib.import_module('jobs.%s' % args.job_name)
    except Exception as e:
        prt_err(str(e))
        prt_err("Error, couldn't load module %s" % args.job_name)
        exit(1)

    # Execute Module
    start = time.time()
    job_module.analyze(spark, **job_args)
    end = time.time()

    prt_high("\nExecution of job %s took %s seconds"
             % (args.job_name, end - start))
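
# A minimal sketch (hypothetical values) of how the --job-args strings are
# turned into the keyword arguments that main() passes to
# jobs.<job_name>.analyze(spark, **job_args). The file paths below are only
# examples borrowed from the jobs' defaults.
def _job_args_example():
    raw_args = ['file=/ships_data/PortSubset-small.csv',
                'output_file=ais_data.parquet']
    job_args_tuples = [arg_str.split('=') for arg_str in raw_args]
    job_args = {a[0]: a[1] for a in job_args_tuples}
    # -> {'file': '/ships_data/PortSubset-small.csv',
    #     'output_file': 'ais_data.parquet'}
    return job_args
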
def log_emission_summary(emis):
    # Add ME and AE
    emis = emis\
        .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae'))\
        .withColumn('nox', col('nox_me') + col('nox_ae'))\
        .withColumn('sox', col('sox_me') + col('sox_ae'))\
        .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log totals
    totals = emis.agg(
        sum(col('trans_p')).alias('total_trans_p'),
        sum(col('trans_p_me')).alias('total_trans_p_me'),
        sum(col('trans_p_ae')).alias('total_trans_p_ae'),
        sum(col('nox')).alias('total_nox'),
        sum(col('sox')).alias('total_sox'),
        sum(col('co2')).alias('total_co2'),
        sum(col('nox_me')).alias('total_nox_me'),
        sum(col('sox_me')).alias('total_sox_me'),
        sum(col('co2_me')).alias('total_co2_me'),
        sum(col('nox_ae')).alias('total_nox_ae'),
        sum(col('sox_ae')).alias('total_sox_ae'),
        sum(col('co2_ae')).alias('total_co2_ae')).toPandas()
    mlflow.log_metrics(totals.iloc[0].to_dict())

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis\
        .withColumn('day', dayofyear(col('time')))\
        .withColumn('week', weekofyear(col('time')))\
        .withColumn('month', month(col('time')))\
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')

    prt_high("Generated summary. Logging it.")

    # Log everything
    log_dataframe_metric(day_df, 'day_day')
    log_dataframe_metric(week_df, 'week_week')
    log_dataframe_metric(month_df, 'month_month')
    log_dataframe_metric(dayofweek_df, 'dayofweek_dayofweek')
def analyze(spark: SparkSession, input_file='emissions.parquet',
            hdfs_path='hdfs://', plot_path='../output'):
    prt_high("""
    Running Summarize Emissions
    ##################################
    Parameters
     - Input file: {}
     - Output HDFS path: {}
     - Output plot path: {}
    ##################################
    """.format(input_file, hdfs_path, plot_path))

    # import os
    # os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"

    emis = spark.read.parquet(input_file)

    # log_emission_summary(emis)
    log_emission_summary_csv(emis, hdfs_path, plot_path)

    return
def analyze(spark: SparkSession, file='/ships_data/IHSData.txt',
            output_file='ihs_metadata.parquet'):
    if not output_file:
        # No path, use tmp
        output_file = tempfile.mkdtemp() + '/ihs_metadata.parquet'

    prt_high("""
    Running metadata ingestion
    ##################################
    Parameters
     - Input file: {}
     - Output file: {}
    ##################################
    """.format(file, output_file))

    ihs_table = spark.read.csv(file, header=True, schema=schema, sep=',',
                               nullValue='NA')
    ihs_table = clean_data(ihs_table)

    ihs_table.write.mode('overwrite').parquet(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifacts(output_file, "ihs_data.parquet")

    return
def analyze(spark: SparkSession, input_file='emissions.parquet',
            output_file='emissions.csv'):
    prt_high("""
    Running export to CSV
    ##################################
    Parameters
     - Input file: {}
     - Output file: {}
    ##################################
    """.format(input_file, output_file))

    df = spark.read.parquet(input_file)

    # Coalesce(1) so that we write one big csv.
    df.orderBy('time', 'imo')\
        .coalesce(1)\
        .write.csv(output_file, header=True, mode='overwrite')

    # Pandas version
    # pdf = df.orderBy('time', 'imo').toPandas()
    # pdf.to_csv(output_file)

    return
def analyze(spark: SparkSession, input_file='/ships_data/IHSData.txt',
            hermes_file='/ships_data/day_summary.csv', model='',
            output_file='hermes_comparison.csv'):
    if not output_file:
        # No path, use tmp
        output_file = tempfile.mkdtemp() + '/hermes_comparison.csv'

    prt_high("""
    Running comparison with HERMES
    ##################################
    Parameters
     - Input file: {}
     - HERMES file: {}
     - Model: {}
     - Output file: {}
    ##################################
    """.format(input_file, hermes_file, model, output_file))

    emis = spark.read.parquet(input_file)
    summ = emis_summary(emis)

    hermes = process_hermes(hermes_file)

    diff = emis_diff(summ, hermes, model)
    diff.to_csv(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        mlflow.log_artifact(output_file, "hermes_comparison.csv")
        log_diff_by_type(diff)

        pols = ["NOx", "SOx", "CO2"]
        models = ["hermes", model]
        piv = pivot_diff_df(diff, models, pols)
        types = diff.type.unique()
        pol_barplot_by_type(piv, types, pols)
        mlflow.log_artifact("barplot")

    return
def analyze(spark: SparkSession, input_file='rasters.parquet',
            output_file='/home/rasters.hdf5'):
    prt_high("""
    ##################################
    Parameters
     - Input file: {}
     - Output file: {}
    ##################################
    """.format(input_file, output_file))

    df = spark.read.parquet(input_file)

    # Metadata processing
    metadf = spark.read.parquet(input_file + ".meta").collect()[0]
    num_vars = metadf['num_vars']
    num_cols = metadf['num_cols']
    num_rows = metadf['num_rows']

    # 'last_amp_v', 'last_cos', 'last_sin')\
    by_time = df.select('hour', 'cell', 'sample_count', 'nox_me', 'nox_ae')\
        .rdd.map(lambda row: (row['hour'], row)).groupByKey()\
        .sortByKey(ascending=True).cache()

    n_rasters = by_time.count()

    filename = output_file
    f = h5py.File(filename, 'w', libver='latest')
    shape = (n_rasters, num_rows, num_cols, num_vars)
    dset = f.create_dataset('dataset', shape, dtype='f', compression="gzip",
                            chunks=(1, num_rows, num_cols, num_vars))

    for i, raster_tuple in enumerate(by_time.toLocalIterator()):
        dset[i] = to_raster(raster_tuple[1], num_rows, num_cols, num_vars)

    # Aggregates by cell
    ds_min = np.min(dset, axis=(0))
    prt_info('min:', ds_min)
    ds_max = np.max(dset, axis=(0))
    prt_info('max:', ds_max)
    ds_mean = np.mean(dset, axis=(0))
    prt_info('mean:', ds_mean)
    ds_std = np.std(dset, axis=(0))
    prt_info('std:', ds_std)

    f.create_dataset('min', data=ds_min)
    f.create_dataset('max', data=ds_max)
    f.create_dataset('mean', data=ds_mean)
    f.create_dataset('std', data=ds_std)

    prt_info("Cell metadata shape:")
    prt_info(f['min'].shape)
    prt_info(f['max'].shape)
    prt_info(f['mean'].shape)
    prt_info(f['std'].shape)

    # Global aggregates
    # We get min and max from the previous result!
    ds_min = np.min(ds_min, axis=(0, 1))
    prt_info('min:', ds_min)
    ds_max = np.max(ds_max, axis=(0, 1))
    prt_info('max:', ds_max)
    ds_mean = np.mean(dset, axis=(0, 1, 2))
    prt_info('mean:', ds_mean)
    ds_std = np.std(dset, axis=(0, 1, 2))
    prt_info('std:', ds_std)

    dset.attrs['min'] = ds_min
    dset.attrs['max'] = ds_max
    dset.attrs['mean'] = ds_mean
    dset.attrs['std'] = ds_std

    f.close()
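
# A minimal read-back sketch of the HDF5 layout written above, assuming the
# job's default output path. Dataset and attribute names follow exactly what
# analyze() creates; the helper name itself is illustrative.
def _read_rasters_example(path='/home/rasters.hdf5'):
    import h5py
    with h5py.File(path, 'r') as f:
        rasters = f['dataset'][...]      # (n_rasters, num_rows, num_cols, num_vars)
        per_cell_min = f['min'][...]     # per-cell aggregates over all timesteps
        global_mean = f['dataset'].attrs['mean']  # global aggregates stored as attrs
    return rasters.shape, per_cell_min.shape, global_mean
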
def analyze(spark: SparkSession, input_data='ships_data.parquet',
            input_metadata='ships_metadata.parquet',
            output_file='emissions.parquet', step=60,
            interpolation_lim=15*60, unit="kg", sfoc="NAEI", model="STEAM",
            ae_on_lim=24*60*60):
    if not output_file:
        # No path, use tmp
        output_file = tempfile.mkdtemp() + '/emissions.parquet'

    prt_high("""
    Running compute emissions
    ##################################
    Parameters
     - Input data: {}
     - Input metadata (IHS): {}
     - Output file: {}
     - Interpolation limit: {} (s)
     - Interpolation step: {} (s)
     - Aux. Eng. at berth limit: {} (s)
     - Unit: {}/{}s
     - SFOC: {}
     - Emission Model: {}
    ##################################
    """.format(input_data, input_metadata, output_file, interpolation_lim,
               step, ae_on_lim, unit, step, sfoc, model))

    # Rename stage
    if 'mlflow' in sys.modules:
        mlflow.set_tag(
            "mlflow.runName", "compute_emissions_{}_{}".format(model, sfoc))

    # Cast parameters
    interpolation_lim = int(interpolation_lim)
    ae_on_lim = int(ae_on_lim)
    step = int(step)
    if unit == "kg":
        prt_info("Setting unit to kilograms")
        unit = 1000
    else:
        prt_info("Setting unit to grams")
        unit = 1

    df = spark.read.parquet(input_data)
    ihs_df = spark.read.parquet(input_metadata)

    # IHS processing
    # Filter desired SFOC
    if sfoc == "NAEI":
        print("Using NAEI SFOC estimation")
        ihs_df = ihs_df.withColumnRenamed("naei_sfoc_me", "sfoc_me")\
            .withColumnRenamed("naei_sfoc_ae", "sfoc_ae")
    else:
        # Other is STEAM by default
        print("Using STEAM SFOC estimation")
        ihs_df = ihs_df.withColumnRenamed("steam_sfocbase_me", "sfoc_me")\
            .withColumnRenamed("steam_sfocbase_ae", "sfoc_ae")

    # Variable filtering and function preparation
    if model == "STEAM2":
        transientPowerMEFunc = udf(transient_power_me_steam2, FloatType())
        transientPowerAEFunc = udf(transient_power_ae_steam2, FloatType())
        ihs_df = ihs_df.select('imo', 'l', 'b', 't', 'qpc', 'wet_surf_k',
                               'wet_surf_a3', 'cr_nofn', 'n_screw', 'n_cabin',
                               'n_ref_teu', 'design_draft', 'waterline',
                               'type', 'hermes_type', 'me_rpm', 'ae_rpm',
                               'inst_pow_me', 'inst_pow_ae', 'design_speed',
                               'sfoc_me', 'sfoc_ae').cache()
        # Which model is used? If there are any nulls on the selected attrs
        # we use STEAM (except for inst_pow)
        ihs_df = count_nulls_steam2(ihs_df)\
            .withColumn("model", (col("nulls") == 0).cast('integer') + 1)
    else:
        # Default is STEAM
        transientPowerMEFunc = udf(transient_power_me_steam, FloatType())
        transientPowerAEFunc = udf(transient_power_ae_steam, FloatType())
        ihs_df = ihs_df.select('imo', 'type', 'hermes_type', 'me_rpm',
                               'ae_rpm', 'inst_pow_me', 'inst_pow_ae',
                               'design_speed', 'sfoc_me', 'sfoc_ae').cache()

    # Dataset joining and SFOC selection
    joined = df.select('nombre', 'imo', 'sog', 'latitude', 'longitude',
                       'time')\
        .join(ihs_df, ['imo'], 'inner')

    # Interpolation
    grouped = joined.rdd.groupBy(lambda record: record['imo'])
    interpolated = grouped.flatMap(
        lambda d: transform_grouped(d, step, interpolation_lim))
    # new_df = interpolated.toDF()

    if model == "STEAM2":
        interp_schema = StructType([
            StructField('imo', LongType(), True),
            StructField('nombre', StringType(), True),
            StructField('sog', DoubleType(), True),
            StructField('latitude', DoubleType(), True),
            StructField('longitude', DoubleType(), True),
            StructField('time', LongType(), True),
            StructField('l', DoubleType(), True),
            StructField('b', DoubleType(), True),
            StructField('t', DoubleType(), True),
            StructField('qpc', DoubleType(), True),
            StructField('wet_surf_k', DoubleType(), True),
            StructField('wet_surf_a3', DoubleType(), True),
            StructField('cr_nofn', DoubleType(), True),
            StructField('n_screw', LongType(), True),
            StructField('n_cabin', LongType(), True),
            StructField('n_ref_teu', LongType(), True),
            StructField('design_draft', BooleanType(), True),
            StructField('waterline', DoubleType(), True),
            StructField('type', StringType(), True),
            StructField('hermes_type', StringType(), True),
            StructField('me_rpm', LongType(), True),
            StructField('ae_rpm', LongType(), True),
            StructField('inst_pow_me', DoubleType(), True),
            StructField('inst_pow_ae', DoubleType(), True),
            StructField('design_speed', DoubleType(), True),
            StructField('sfoc_me', LongType(), True),
            StructField('sfoc_ae', LongType(), True),
            StructField('nulls', LongType(), True),
            StructField('model', LongType(), True),
            StructField('last_move', LongType(), True),
            StructField('d_lat', DoubleType(), True),
            StructField('d_lon', DoubleType(), True),
            StructField('amp_v', DoubleType(), True)
        ])
    else:
        # Steam 1
        interp_schema = StructType([
            StructField('imo', LongType(), True),
            StructField('nombre', StringType(), True),
            StructField('sog', DoubleType(), True),
            StructField('latitude', DoubleType(), True),
            StructField('longitude', DoubleType(), True),
            StructField('time', LongType(), True),
            StructField('type', StringType(), True),
            StructField('hermes_type', StringType(), True),
            StructField('me_rpm', LongType(), True),
            StructField('ae_rpm', LongType(), True),
            StructField('inst_pow_me', DoubleType(), True),
            StructField('inst_pow_ae', DoubleType(), True),
            StructField('design_speed', DoubleType(), True),
            StructField('sfoc_me', LongType(), True),
            StructField('sfoc_ae', LongType(), True),
            StructField('last_move', LongType(), True),
            StructField('d_lat', DoubleType(), True),
            StructField('d_lon', DoubleType(), True),
            StructField('amp_v', DoubleType(), True)
        ])

    # Setting the schema for the new data
    new_df = spark.createDataFrame(data=interpolated, schema=interp_schema)
    # new_df = change_column_type(new_df, 'time', IntegerType(), True)
    # new_df = change_column_type(new_df, 'latitude', FloatType(), True)
    # new_df = change_column_type(new_df, 'longitude', FloatType(), True)
    # new_df = change_column_type(new_df, 'sog', FloatType(), True)
    # new_df = change_column_type(new_df, 'imo', IntegerType(), True)
    # new_df = change_column_type(new_df, 'd_lat', FloatType(), True)
    # new_df = change_column_type(new_df, 'd_lon', FloatType(), True)
    # new_df = change_column_type(new_df, 'amp_v', FloatType(), True)

    if model == "STEAM2":
        # Transient power calculation
        new_df = new_df.withColumn(
            'trans_p_me', transientPowerMEFunc(
                new_df['model'], new_df['sog'], new_df['design_speed'],
                new_df['inst_pow_me'], new_df['l'], new_df['b'], new_df['t'],
                new_df['qpc'], new_df['wet_surf_k'], new_df['wet_surf_a3'],
                new_df['cr_nofn'], new_df['n_screw'], new_df['design_draft'],
                new_df['waterline']))
        new_df = new_df.withColumn(
            'trans_p_ae', transientPowerAEFunc(
                new_df['sog'], new_df['type'], new_df['inst_pow_ae'],
                new_df['n_cabin'], new_df['n_ref_teu']))
    else:
        # Transient power calculation
        new_df = new_df.withColumn(
            'trans_p_me', transientPowerMEFunc(
                new_df['sog'], new_df['design_speed'], new_df['inst_pow_me']))
        new_df = new_df.withColumn(
            'trans_p_ae', transientPowerAEFunc(
                new_df['sog'], new_df['type'], new_df['inst_pow_ae']))

    # Deactivate AE if the ship has been at berth more than 24h
    if ae_on_lim > 0:
        new_df = new_df.withColumn(
            "trans_p_ae",
            when(col('last_move') < ae_on_lim, col("trans_p_ae"))
            .otherwise(0))

    calcSOxEmissionFactorFunc = udf(calcSOxEmissionFactor, FloatType())
    calcCO2EmissionFactorFunc = udf(calcCO2EmissionFactor, FloatType())
    calcNOxEmissionFactorFunc = udf(calcNOxEmissionFactor, FloatType())
    # TODO: Maybe this shouldn't be a UDF
    estimateEmissionFunc = udf(
        lambda fact, pow: estimateEmission(fact, pow, step, unit),
        FloatType())

    # Emission factor calculation
    # TODO: Move this to R script
    new_df = new_df.withColumn(
        'sox_fact_me', calcSOxEmissionFactorFunc(new_df['sfoc_me']))
    new_df = new_df.withColumn(
        'sox_fact_ae', calcSOxEmissionFactorFunc(new_df['sfoc_ae']))
    new_df = new_df.withColumn(
        'co2_fact_me', calcCO2EmissionFactorFunc(new_df['sfoc_me']))
    new_df = new_df.withColumn(
        'co2_fact_ae', calcCO2EmissionFactorFunc(new_df['sfoc_ae']))
    new_df = new_df.withColumn(
        'nox_fact_me', calcNOxEmissionFactorFunc(new_df['me_rpm']))
    new_df = new_df.withColumn(
        'nox_fact_ae', calcNOxEmissionFactorFunc(new_df['ae_rpm']))

    # Emission calculation
    new_df = new_df.withColumn(
        'sox_me', estimateEmissionFunc(
            new_df['sox_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
        'sox_ae', estimateEmissionFunc(
            new_df['sox_fact_ae'], new_df['trans_p_ae']))
    new_df = new_df.withColumn(
        'co2_me', estimateEmissionFunc(
            new_df['co2_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
        'co2_ae', estimateEmissionFunc(
            new_df['co2_fact_ae'], new_df['trans_p_ae']))
    new_df = new_df.withColumn(
        'nox_me', estimateEmissionFunc(
            new_df['nox_fact_me'], new_df['trans_p_me']))
    new_df = new_df.withColumn(
        'nox_ae', estimateEmissionFunc(
            new_df['nox_fact_ae'], new_df['trans_p_ae']))

    new_df = ensure_columns_type(new_df)

    new_df.write.mode('overwrite').parquet(output_file)

    if 'mlflow' in sys.modules:
        prt_high("Logging MLFlow artifacts")
        prt_high("- emissions.parquet")
        mlflow.log_artifacts(output_file, "emissions.parquet")
        prt_high("- emissions summary")
        log_emission_summary(new_df)

    return
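
# A hedged sketch of the emission estimate wrapped by estimateEmissionFunc
# above. The repository's estimateEmission() is defined elsewhere; the formula
# below assumes the common convention of factor (g/kWh) * power (kW) * time
# step (converted from seconds to hours) / unit divisor, and may differ from
# the actual implementation.
def _estimate_emission_sketch(fact, power, step, unit):
    # fact: emission factor in g/kWh, power: transient power in kW,
    # step: time step in seconds, unit: divisor (1000 for kg, 1 for g)
    return fact * power * (step / 3600.0) / unit
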
def log_emission_summary_csv(emis, hdfs_path, plot_path):
    # Add ME and AE
    emis = emis\
        .withColumn('trans_p', col('trans_p_me') + col('trans_p_ae'))\
        .withColumn('nox', col('nox_me') + col('nox_ae'))\
        .withColumn('sox', col('sox_me') + col('sox_ae'))\
        .withColumn('co2', col('co2_me') + col('co2_ae'))

    # Log totals
    totals = emis.agg(
        sum(col('trans_p')).alias('total_trans_p'),
        sum(col('trans_p_me')).alias('total_trans_p_me'),
        sum(col('trans_p_ae')).alias('total_trans_p_ae'),
        sum(col('nox')).alias('total_nox'),
        sum(col('sox')).alias('total_sox'),
        sum(col('co2')).alias('total_co2'),
        sum(col('nox_me')).alias('total_nox_me'),
        sum(col('sox_me')).alias('total_sox_me'),
        sum(col('co2_me')).alias('total_co2_me'),
        sum(col('nox_ae')).alias('total_nox_ae'),
        sum(col('sox_ae')).alias('total_sox_ae'),
        sum(col('co2_ae')).alias('total_co2_ae'))

    # Generate time features
    emis = emis.withColumn('time', emis.time.cast(dataType=TimestampType()))
    emis = emis\
        .withColumn('day', dayofyear(col('time')))\
        .withColumn('week', weekofyear(col('time')))\
        .withColumn('month', month(col('time')))\
        .withColumn('dayofweek', dayofweek(col('time'))).cache()

    day_df = group_emis(emis, 'day')
    week_df = group_emis(emis, 'week')
    month_df = group_emis(emis, 'month')
    dayofweek_df = group_emis(emis, 'dayofweek')

    prt_high("Generated summary. Logging it.")

    def save_csv(df, path):
        df.coalesce(1)\
            .write.csv(path, header=True, mode='overwrite')

    # Saving CSVs in HDFS - coalesce(1) writes each csv as a single file
    save_csv(totals, hdfs_path + "/emis.csv")
    save_csv(day_df, hdfs_path + "/day.csv")
    save_csv(week_df, hdfs_path + "/week.csv")
    save_csv(month_df, hdfs_path + "/month.csv")
    save_csv(dayofweek_df, hdfs_path + "/dayofweek.csv")

    # Make directories
    mkdir_if_not_exist(plot_path)
    mkdir_if_not_exist(plot_path + "/plot")
    mkdir_if_not_exist(plot_path + "/plot/day")
    mkdir_if_not_exist(plot_path + "/plot/week")
    mkdir_if_not_exist(plot_path + "/plot/month")
    mkdir_if_not_exist(plot_path + "/plot/dayofweek")

    # Save plots in the container
    plot_summary(day_df.toPandas(), plot_path + "/plot/day", x="day_day")
    plot_summary(week_df.toPandas(), plot_path + "/plot/week", x="week_week")
    plot_summary(month_df.toPandas(), plot_path + "/plot/month",
                 x="month_month")
    plot_summary(dayofweek_df.toPandas(), plot_path + "/plot/dayofweek",
                 x="dayofweek_dayofweek")
def analyze(spark: SparkSession, input_file='emissions.parquet',
            output_file='rasters.parquet', time_granularity=600,
            num_cols=100, cell_size=None, use_type=False):
    prt_high("""
    Running raster generation.
    ##################################
    Parameters
     - Input file: {}
     - Output file: {}
     - Time granularity: {}
     - Cell size: {}
     - Number of raster columns: {}
     - Rasters by type: {}
    ##################################
    """.format(input_file, output_file, time_granularity, cell_size,
               num_cols, use_type))

    cell_size_meters = None
    # Process parameters
    if cell_size is not None:
        prt_high("Info: Cell size set in parameters, using it")
        if cell_size[-1] == 'm':
            # Meters
            cell_size_meters = int(cell_size[:-1])  # All except the last char
            cell_size = meters_to_deg(cell_size_meters)
        else:
            cell_size = int(cell_size)
    else:
        num_cols = int(num_cols)
    time_granularity = int(time_granularity)

    # TODO: FILL NUM_VARS automatically
    prt_warn("WARNING: Number of variables is manually set to 10")
    num_vars = 10

    df = spark.read.parquet(input_file)

    min_max_lat_lon = df.agg(F.min(df.latitude), F.max(df.latitude),
                             F.min(df.longitude), F.max(df.longitude))
    min_max_time = df.agg(F.min(df.time), F.max(df.time))

    min_max_row = min_max_lat_lon.first()
    min_lat = min_max_row[0]
    max_lat = min_max_row[1]
    min_lon = min_max_row[2]
    max_lon = min_max_row[3]

    min_max_time = min_max_time.first()
    min_time = min_max_time[0]
    max_time = min_max_time[1]

    if cell_size is None:
        # Calculate cell dimension and number of rows using the number of
        # columns defined
        cell_size = (max_lon - min_lon) / num_cols
    else:
        num_cols = int(np.ceil((max_lon - min_lon) / cell_size))
    num_rows = int(np.ceil((max_lat - min_lat) / cell_size))

    prt_info("NUM COLS: " + str(num_cols))
    prt_info("NUM ROWS: " + str(num_rows))
    prt_info("CELL DIM: " + str(cell_size))

    # Create metadata file
    prt_info("Building metadata")
    meta = [(num_cols, num_rows, num_vars, min_lat, max_lat, min_lon,
             max_lon, cell_size, cell_size_meters, min_time, max_time,
             time_granularity)]
    rdd = spark.sparkContext.parallelize(meta)
    metarow = rdd.map(lambda x: Row(num_cols=int(x[0]), num_rows=int(x[1]),
                                    num_vars=int(x[2]), min_lat=float(x[3]),
                                    max_lat=float(x[4]), min_lon=float(x[5]),
                                    max_lon=float(x[6]),
                                    cell_size=float(x[7]),
                                    cell_size_meters=float(x[8]),
                                    min_time=int(x[9]), max_time=int(x[10]),
                                    time_granularity=int(x[11])))
    metadf = spark.createDataFrame(metarow)

    # Create rasters
    prt_info("Building rasters")
    # TODO: Implement a way to define cell size
    to_cell = partial(lat_lon_to_cell, min_lon, min_lat, num_cols, num_rows,
                      cell_size)
    convertToCellFunc = udf(to_cell, IntegerType())
    to_hours = partial(to_time_resolution, time_granularity)
    convertToHours = udf(to_hours, IntegerType())

    df_cell = df.withColumn('cell', convertToCellFunc(df['longitude'],
                                                      df['latitude']))
    df_cell = df_cell.withColumn('hour', convertToHours(df['time']))

    # use desc order and first to get the last value
    # https://stackoverflow.com/questions/43114445/how-to-use-first-and-last-function-in-pyspark
    w = Window().partitionBy('imo', 'hour').orderBy(df_cell.time.desc())
    # lower than any possible sin/cos value so max only gets valid values
    df_cell = df_cell.withColumn('last_time', first("time").over(w))
    # PLACEHOLDER = -9999.0
    # df_cell = df_cell.withColumn('last_amp_v',
    #                              when(df_cell['last_time'] == df_cell['time'],
    #                                   df_cell['amp_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_cos',
    #                              when(df_cell['last_time'] == df_cell['time'],
    #                                   df_cell['cos_v']).otherwise(PLACEHOLDER))
    # df_cell = df_cell.withColumn('last_sin',
    #                              when(df_cell['last_time'] == df_cell['time'],
    #                                   df_cell['sin_v']).otherwise(PLACEHOLDER))

    # potential bug: max(last_amp_v), max('last_cos'), max('last_sin'). Each
    # may be from different ships if they happen to be in the same raster cell
    if use_type:
        raster = df_cell.groupBy("cell", "hour", "type")
    else:
        raster = df_cell.groupBy("cell", "hour")

    raster = raster.agg(
        F.sum("sox_me").alias("sox_me"),
        F.sum("sox_ae").alias("sox_ae"),
        F.sum('co2_me').alias('co2_me'),
        F.sum("co2_ae").alias("co2_ae"),
        F.sum("nox_me").alias("nox_me"),
        F.sum('nox_ae').alias('nox_ae'),
        # F.max('last_amp_v').alias('last_amp_v'),
        # F.max('last_cos').alias('last_cos'),
        # F.max('last_sin').alias('last_sin'),
        F.count('*').alias('sample_count'))

    # Lower limit for these attributes
    # raster = raster.withColumn(
    #     'last_amp_v', when(raster['last_amp_v'] <= (PLACEHOLDER + 1), 0)
    #     .otherwise(raster['last_amp_v']))
    # raster = raster.withColumn(
    #     'last_cos', when(raster['last_cos'] <= (PLACEHOLDER + 1), 0)
    #     .otherwise(raster['last_cos']))
    # raster = raster.withColumn(
    #     'last_sin', when(raster['last_sin'] <= (PLACEHOLDER + 1), 0)
    #     .otherwise(raster['last_sin']))

    raster = change_column_type(raster, 'sample_count', IntegerType(),
                                force=True)
    raster = change_column_type(raster, 'sox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'sox_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_me', FloatType(), force=True)
    raster = change_column_type(raster, 'co2_ae', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_me', FloatType(), force=True)
    raster = change_column_type(raster, 'nox_ae', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_amp_v', FloatType(),
    #                             force=True)
    # raster = change_column_type(raster, 'last_cos', FloatType(), force=True)
    # raster = change_column_type(raster, 'last_sin', FloatType(), force=True)
    raster = ensure_columns_type(raster)

    # Write rasters
    raster.write.mode('overwrite').parquet(output_file)
    # Write metadata
    metadf.write.mode('overwrite').parquet(output_file + '.meta')

    return
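
# A hedged sketch of the cell/time bucketing helpers used above.
# lat_lon_to_cell and to_time_resolution are defined elsewhere in the repo;
# the formulas below are assumptions consistent with how they are called
# (same argument order as the partial() bindings), not the actual code.
def _lat_lon_to_cell_sketch(min_lon, min_lat, num_cols, num_rows, cell_size,
                            lon, lat):
    # Map a coordinate to a flat cell index, row-major over a
    # num_rows x num_cols grid of cell_size-degree cells
    col_idx = int((lon - min_lon) / cell_size)
    row_idx = int((lat - min_lat) / cell_size)
    return row_idx * num_cols + col_idx


def _to_time_resolution_sketch(time_granularity, timestamp):
    # Bucket an epoch timestamp (s) into time_granularity-sized steps
    return int(timestamp // time_granularity)
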
def analyze(spark: SparkSession, input_data='emissions.parquet', db='ais',
            table='emis', host='localhost', port=5431, user='******',
            passwd='pass', table_type='ais', time_col='time',
            lon='longitude', lat='latitude', idx_fields="(imo, type)",
            ihs_table="ihs"):
    prt_high("""
    Running export to PostgreSQL
    ##################################
    Parameters
     - Input file: {}
     - User: {}
     - Password: Censored :P
     - Database: {}
     - Host: {}
     - Table: {} (type: {})
     - Port: {}
     - Time column: {}
     - Latitude: {}
     - Longitude: {}
     - Idx Fields: {}
     - IHS table: {}
    ##################################
    """.format(input_data, user, db, host, table, table_type, port, time_col,
               lat, lon, str(idx_fields), ihs_table))

    # Rename stage
    if 'mlflow' in sys.modules:
        mlflow.set_tag("mlflow.runName", "export_postgis_{}".format(table))

    # Connections
    mode = "overwrite"
    url = "jdbc:postgresql://{}:{}/{}".format(host, port, db)
    properties = {
        "user": user,
        "password": passwd,
        "driver": "org.postgresql.Driver",
    }

    prt_info("Processing parquet file")
    emis = spark.read.parquet(input_data)

    conn = psycopg2.connect(
        "dbname='{}' user='{}' host='{}' password='{}' port={}".format(
            db, user, host, passwd, port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)  # Allow CREATE INDEX

    # Remove materialized view if exists
    if table_type == "ais" or table_type == "ihs":
        clean_table_and_derived(conn, table)

    # Write table
    if table_type != "ihs":
        emis = emis.withColumn(time_col,
                               col(time_col).cast(dataType=t.TimestampType()))
    prt_info("Exporting to JDBC")
    emis.write.jdbc(url=url, table=table, mode=mode, properties=properties)

    # Create indices
    conn = psycopg2.connect(
        "dbname='{}' user='{}' host='{}' password='{}' port={}".format(
            db, user, host, passwd, port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)  # Allow CREATE INDEX
    if table_type != "ihs":
        add_indices(conn, table, idx_fields)
        add_geometry_field(conn, table, lon, lat)
        add_in_port_attr(conn, table)
    if table_type == "ais":
        create_ais_info_view(conn, table, ihs_table)
    conn.close()

    return
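
# A hedged sketch of what add_geometry_field() presumably does. The helper is
# defined elsewhere in the repo, so the column name 'geom', the SRID 4326 and
# the index name below are assumptions for illustration, not the actual SQL.
def _add_geometry_field_sketch(conn, table, lon, lat):
    with conn.cursor() as cur:
        # Add a PostGIS point column and populate it from the lon/lat columns
        cur.execute(
            "ALTER TABLE {} ADD COLUMN IF NOT EXISTS geom geometry(Point, 4326)"
            .format(table))
        cur.execute(
            "UPDATE {} SET geom = ST_SetSRID(ST_MakePoint({}, {}), 4326)"
            .format(table, lon, lat))
        # Spatial index so map/bounding-box queries stay fast
        cur.execute(
            "CREATE INDEX IF NOT EXISTS {}_geom_idx ON {} USING GIST (geom)"
            .format(table, table))
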
def analyze(spark: SparkSession, input_file='rasters.parquet',
            output_folder='/tmp/img', units='kg'):
    prt_high("""
    Running image generation.
    ##################################
    Parameters
     - Input file: {}
     - Output folder: {}
    """.format(input_file, output_folder))

    df = spark.read.parquet(input_file)
    meta = spark.read.parquet(input_file + '.meta').toPandas()

    pol_vars = ["sox_me", "sox_ae", "co2_me", "co2_ae", "nox_me", "nox_ae"]
    pol_vars.extend(['hour', 'cell'])

    n_cols = meta.num_cols[0]
    n_rows = meta.num_rows[0]

    # Define the transformation of data (Bounding box)
    transform = from_origin(meta.min_lon[0], meta.max_lat[0],
                            meta.cell_size[0], meta.cell_size[0])

    # Produce a GeoTIFF and PDF per pollutant (Including ME, AE and
    # joint(ME+AE))
    timestamps = df.select('hour').distinct().collect()
    for t in timestamps:
        # Generate the rasters for this timestep
        timestamp = t['hour']
        data = df.select(pol_vars).filter(df.hour == t['hour']).toPandas()
        r = pandas_to_raster(data, pol_vars, n_rows, n_cols)
        if units == 'kg':
            r = r / (1000 * meta.cell_size_meters[0]**2)
        prt_info("Calculated raster sum: ", r.sum(axis=(0, 1, 2)))

        # joint raster ME+AE
        # TODO: r_me_ae declaration can be done previously
        pol_vars_me_ae = ['sox_', 'co2_', 'nox_']
        shp = list(r.shape)
        shp[2] = shp[2] // 2
        shp = tuple(shp)
        r_me_ae = np.zeros(shp, dtype=np.float32)
        for i in range(0, len(pol_vars_me_ae)):
            r_me_ae[:, :, i] = r[:, :, i * 2] + r[:, :, i * 2 + 1]

        # Save the rasters to GeoTIFF
        file_path = output_folder + '/' + str(timestamp) + '/'
        try:
            os.makedirs(file_path)
        except OSError as e:
            prt_warn(str(e))
            prt_warn("Warning: folder exists " + file_path)

        for p in range(0, len(pol_vars)):
            raster_path = file_path + pol_vars[p]
            create_band_tiff(r, transform, p, raster_path + '.tif')

        for i in range(0, len(pol_vars_me_ae)):
            raster_path = file_path + pol_vars_me_ae[i]
            create_band_tiff(r_me_ae, transform, i, raster_path + '.tif')

    return
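
# A hedged sketch of a create_band_tiff-style writer using rasterio. The real
# helper lives elsewhere in the repo; the GTiff driver, the CRS (assumed
# EPSG:4326 since the transform is built from lon/lat degrees) and the
# single-band output below are assumptions for illustration only.
def _create_band_tiff_sketch(raster, transform, band, path):
    import rasterio
    with rasterio.open(path, 'w', driver='GTiff',
                       height=raster.shape[0], width=raster.shape[1],
                       count=1, dtype=str(raster.dtype),
                       crs='EPSG:4326', transform=transform) as dst:
        dst.write(raster[:, :, band], 1)  # write the selected pollutant band
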
        job (example: --job-args template=manual-email1 foo=bar")
    parser.add_argument(
        '--log', type=str, dest='log_level', default='WARN',
        help="Level of Spark logging (default = WARN).")
    parser.add_argument(
        '--extra-jars', type=str, dest='extra_jars', default='',
        help="Extra java jars to be added")
    parser.add_argument(
        '--hdfs', type=str, dest='hdfs', default='',
        help="HDFS endpoint")

    args = parser.parse_args()
    prt_info("Called with arguments: %s" % args)

    # Run main
    if 'mlflow' in sys.modules:
        prt_high("- Running with MLFlow")
        mlflow.start_run()
        # Setting run_name in start run doesn't work
        mlflow.set_tag("mlflow.runName", args.job_name)

    main(args)

    if 'mlflow' in sys.modules:
        prt_high("MLFlow: Shutting down run.")
        mlflow.end_run()
except KeyboardInterrupt:
    print('Interrupted :(')
    try:
        sys.exit(130)
    except SystemExit:
        os._exit(130)