raw_data_file = os.path.join(data_dir, os.path.basename(url))
urllib.request.urlretrieve(url, raw_data_file)

'''
Process data
'''
# netcdf subdatasets that will be used in processing
subdatasets = [
    'stdv_maxmonth',
    'stdv_annual',
    'mask',
]

logger.info('Extracting relevant GeoTIFFs from source NetCDF')
# convert netcdf to individual tif files for each of the subdatasets specified
tifs = util_files.convert_netcdf(raw_data_file, subdatasets)

logger.info('Mask extracted GeoTIFFs using dataset mask')
# create dict linking subdatasets from the netcdf to the geotiffs that now contain each
sds_file_dict = dict(zip(subdatasets, tifs))
# set nodata value for masking
nodata = -128
# define netcdf subdatasets that will be uploaded as bands to GEE
band_ids = [
    'stdv_maxmonth',
    'stdv_annual',
]
# cycle through target tifs (corresponding to bands) and mask them with mask tif
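# Hedged sketch (illustrative, not part of the original script): util_files.convert_netcdf
# is a repo helper whose implementation is not shown in this excerpt. Assuming it wraps
# GDAL's NetCDF subdataset syntax, extracting one GeoTIFF per subdataset could look
# roughly like the following; the function name and return convention here are assumptions.
from osgeo import gdal

def convert_netcdf_sketch(nc_file, subdataset_names):
    '''Extract each named NetCDF subdataset to its own GeoTIFF and return the paths.'''
    out_tifs = []
    for sds in subdataset_names:
        out_tif = nc_file[:-3] + '_' + sds + '.tif'
        # GDAL addresses a NetCDF variable as NETCDF:"<file>":<variable>
        gdal.Translate(out_tif, 'NETCDF:"{}":{}'.format(nc_file, sds))
        out_tifs.append(out_tif)
    return out_tifs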
target_date = '20200623'
for key, val in data_dict.items():
    val['url'] = val['url_template'].format(target_year, target_date)

logger.info('Downloading raw data')
for key, val in data_dict.items():
    url = val['url']
    raw_data_file = os.path.join(data_dir, os.path.basename(url))
    urllib.request.urlretrieve(url, raw_data_file)
    val['raw_data_file'] = raw_data_file
    logger.debug('(' + key + ') ' + 'Raw data file path: ' + raw_data_file)

logger.info('Extracting relevant GeoTIFFs from source NetCDFs')
for key, val in data_dict.items():
    val['tifs'] = util_files.convert_netcdf(val['raw_data_file'], val['sds'])

# no masking necessary, in theory
# most of the datasets do need to be scaled, though
alltifs = []
for key, val in data_dict.items():
    if 'scale_factor' in val and float(val['scale_factor']) != 1.0:
        finaltifs = []
        for tif in val['tifs']:
            finaltifs.append(util_files.scale_geotiff(tif))
    else:
        finaltifs = val['tifs']
    val['finaltifs'] = finaltifs
    alltifs.extend(finaltifs)

logger.info(
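# Hedged sketch (illustrative, not part of the original script): util_files.scale_geotiff
# is a repo helper whose implementation is not shown. Assuming it multiplies pixel values
# by the dataset's scale factor and writes a new file, a minimal rasterio version could
# look like this; the name, signature, and '_scaled' suffix are assumptions.
import rasterio

def scale_geotiff_sketch(tif, scale_factor=1.0):
    '''Multiply every band by scale_factor, preserving nodata, and write a new GeoTIFF.'''
    out_tif = tif[:-4] + '_scaled.tif'
    with rasterio.open(tif) as src:
        profile = src.profile.copy()
        data = src.read().astype('float32')
        if src.nodata is not None:
            nodata_mask = data == src.nodata
        data = data * scale_factor
        if src.nodata is not None:
            data[nodata_mask] = src.nodata
        profile.update(dtype='float32')
    with rasterio.open(out_tif, 'w', **profile) as dst:
        dst.write(data)
    return out_tif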
# download the data from the source
raw_data_file = [os.path.join(data_dir, os.path.basename(url)) for url in url_list]
for url, file in zip(url_list, raw_data_file):
    urllib.request.urlretrieve(url, file)

# unzip source data
raw_data_file_unzipped = [file[:-3] for file in raw_data_file]
for file, file_unzipped in zip(raw_data_file, raw_data_file_unzipped):
    with gzip.open(file, 'rb') as f_in:
        with open(file_unzipped, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# convert the netcdf files to tif files
for raw_file in raw_data_file_unzipped:
    util_files.convert_netcdf(raw_file, ['precip'])
processed_data_file = [os.path.join(raw_file[:-3] + '_precip.tif') for raw_file in raw_data_file_unzipped]
processed_data_annual = [os.path.join(data_dir, 'full_data_annual_v2020_' + str(year) + '_025_precip.tif') for year in range(1891, 2020)]
n_layers = [int(rasterio.open(file).meta['count'] / 12) for file in processed_data_file]

# calculate annual total precipitation
for id, file in enumerate(processed_data_file, start=0):
    with rasterio.open(file) as src0:
        # update metadata for annual aggregation
        meta = src0.meta
        meta.update(count=1)
        meta.update(nodata=meta['nodata'] * 12)
        # sum and export annual total precipitation as tif file
        for i in range(int(src0.meta['count'] / 12)):
            with rasterio.open(processed_data_annual[sum(n_layers[:id]) + i], 'w', **meta) as dst:
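# Hedged sketch (illustrative, not part of the original script): the body of the export
# loop above is truncated in this excerpt. A standalone helper that sums 12 monthly bands
# into one annual band might look like the following; the function name and the assumption
# that bands are stored chronologically, 12 per year, are not from the source.
import rasterio

def sum_monthly_bands_sketch(src_path, dst_path, year_index):
    '''Sum 12 monthly precipitation bands for one year into a single-band GeoTIFF.'''
    with rasterio.open(src_path) as src:
        meta = src.meta.copy()
        meta.update(count=1)
        # rasterio band indexes are 1-based
        months = list(range(year_index * 12 + 1, (year_index + 1) * 12 + 1))
        annual_total = src.read(months).sum(axis=0)
    with rasterio.open(dst_path, 'w', **meta) as dst:
        dst.write(annual_total, 1)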
    for year in range(1992, 2020)
]

for raw_data in raw_data_file:
    with ZipFile(raw_data, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
raw_data_file_unzipped = glob.glob(os.path.join(data_dir, '*.nc'))
# sort files by year
raw_data_file_unzipped = sorted(raw_data_file_unzipped, key=lambda x: x.split('-')[7])

'''
Process data
'''
# convert the netcdf files to tif files
for raw_file in raw_data_file_unzipped:
    util_files.convert_netcdf(raw_file, ['lccs_class'])
processed_data_file = [
    os.path.join(raw_file[:-3] + '_lccs_class.tif')
    for raw_file in raw_data_file_unzipped
]

'''
Upload processed data to Google Earth Engine
'''
# set up uploading chunk size
# The default setting assumes an upload speed of at least 10 MB/min; reduce the chunk size if the network connection is slow.
storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB
storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB

logger.info('Uploading processed data to Google Cloud Storage.')
# set up Google Cloud Storage project and bucket objects
gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT"))
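# Hedged sketch (illustrative, not part of the original script): in these scripts the
# actual upload is delegated to util_cloud.gcs_upload, which is not shown here. Assuming
# it wraps the google-cloud-storage client set up above, a minimal version could look like
# this; the function name and the choice to key blobs by file basename are assumptions.
import os

def gcs_upload_sketch(local_files, bucket):
    '''Upload each local file to the bucket and return the resulting gs:// URIs.'''
    uris = []
    for local_file in local_files:
        blob = bucket.blob(os.path.basename(local_file))
        blob.upload_from_filename(local_file)
        uris.append('gs://{}/{}'.format(bucket.name, blob.name))
    return uris

# usage sketch, assuming a bucket object obtained via gcsClient.bucket(...):
# gcs_uris = gcs_upload_sketch(processed_data_file, gcsBucket)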
# define the years into which the netcdf will be separated
years = np.arange(1950, 2021)
# loop through years
for year in years:
    # define command to select a single year from the netcdf
    cmd = ('cdo -selyear,{} {} {}'.format(
        year, annual_data_file, processed_data_file_convention.format(year)))
    subprocess.check_output(cmd, shell=True)

# convert the netcdf files to tif files
processed_data_files = [
    processed_data_file_convention.format(year) for year in years
]
for raw_file in processed_data_files:
    util_files.convert_netcdf(raw_file, ['air'])
processed_data_annual = [
    os.path.join(raw_file[:-3] + '_air.tif') for raw_file in processed_data_files
]

'''
Upload processed data to Google Earth Engine
'''
logger.info('Uploading processed data to Google Cloud Storage.')
# set up Google Cloud Storage project and bucket objects
print(os.environ.get("CLOUDSDK_CORE_PROJECT"))
gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT"))
gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET"))
# upload files to Google Cloud Storage
gcs_uris = util_cloud.gcs_upload(processed_data_annual,
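# Hedged note (illustrative, not part of the original script): the cdo -selyear step above
# requires the Climate Data Operators command-line tool to be installed. If cdo is not
# available, a pure-Python alternative, under the assumption that the NetCDF has a standard
# CF 'time' coordinate, could use xarray's partial datetime selection, roughly like this:
import xarray as xr

def split_netcdf_by_year_sketch(annual_data_file, out_file_convention, years):
    '''Write one NetCDF per year, selected from a multi-year source file.'''
    with xr.open_dataset(annual_data_file) as ds:
        for year in years:
            ds.sel(time=str(year)).to_netcdf(out_file_convention.format(year))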
Process data
'''
# netcdf subdatasets that will be used in processing
subdatasets = [
    'n_gt0',   # The number of events for which the thermal stress, measured by Degree Heating Weeks, exceeded 0 degC-weeks.
    'n_ge4',   # The number of events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 4 degC-weeks.
    'n_ge8',   # The number of events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 8 degC-weeks.
    'rp_gt0',  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, exceeded 0 degC-weeks.
    'rp_ge4',  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 4 degC-weeks.
    'rp_ge8',  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 8 degC-weeks.
]
mask_sds = ['mask']

logger.info('Extracting relevant GeoTIFFs from source NetCDF')
# convert netcdf to individual tif files for each of the subdatasets specified
tifs = util_files.convert_netcdf(raw_data_file, subdatasets)
mask = util_files.convert_netcdf(raw_data_file, mask_sds)[0]

logger.info('Masking GeoTIFFs to reflect dataset coverage')
nodata = -128
i = 0
maskedtifs = []
for target in tifs:
    sds = subdatasets[i]
    maskedtif = os.path.join(data_dir, os.path.basename(target)[:-4] + '_masked.tif')
    util_files.mask_geotiff(target, mask, maskedtif, nodata=nodata)
    maskedtifs.append(maskedtif)
    i += 1

# generate a name for processed tif
processed_data_file = os.path.join(data_dir, dataset_name + '.tif')
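# Hedged sketch (illustrative, not part of the original script): util_files.mask_geotiff
# is a repo helper whose implementation is not shown. Assuming it stamps the dataset's
# coverage mask onto each band tif, a minimal single-band rasterio version could look like
# this; the function name and single-band assumption are not from the source.
import rasterio

def mask_geotiff_sketch(target, mask, out_tif, nodata=-128):
    '''Set target pixels to nodata wherever the mask raster is nodata, then write out_tif.'''
    with rasterio.open(target) as src, rasterio.open(mask) as msk:
        data = src.read(1)
        invalid = msk.read(1) == msk.nodata
        profile = src.profile.copy()
    data[invalid] = nodata
    profile.update(nodata=nodata)
    with rasterio.open(out_tif, 'w', **profile) as dst:
        dst.write(data, 1)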