raw_data_file = os.path.join(data_dir, os.path.basename(url))
urllib.request.urlretrieve(url, raw_data_file)
'''
Process data
'''

# netcdf subdatasets that will be used in processing
subdatasets = [
    'stdv_maxmonth',
    'stdv_annual',
    'mask',
]

logger.info('Extracting relevant GeoTIFFs from source NetCDF')
# convert netcdf to individual tif files for each of the subdatasets specified
tifs = util_files.convert_netcdf(raw_data_file, subdatasets)

logger.info('Mask extracted GeoTIFFs using dataset mask')
# create dict linking subdatasets from the netcdf to the geotiffs that now contain each
sds_file_dict = dict(zip(subdatasets, tifs))

# set nodata value for masking
nodata = -128

# define netcdf subdatasets that will be uploaded as bands to GEE
band_ids = [
    'stdv_maxmonth',
    'stdv_annual',
]

# cycle through target tifs (corresponding to bands) and mask them with mask tif
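# a minimal sketch of that loop, assuming the same util_files.mask_geotiff helper and
# '_masked.tif' naming convention used in the last example below
masked_tifs = []
for band in band_ids:
    target = sds_file_dict[band]
    masked_tif = os.path.join(data_dir, os.path.basename(target)[:-4] + '_masked.tif')
    util_files.mask_geotiff(target, sds_file_dict['mask'], masked_tif, nodata=nodata)
    masked_tifs.append(masked_tif)
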
target_date = '20200623'

for key, val in data_dict.items():
    val['url'] = val['url_template'].format(target_year, target_date)

logger.info('Downloading raw data')
for key, val in data_dict.items():
    url = val['url']
    raw_data_file = os.path.join(data_dir, os.path.basename(url))
    urllib.request.urlretrieve(url, raw_data_file)
    val['raw_data_file'] = raw_data_file
    logger.debug('({}) Raw data file path: {}'.format(key, raw_data_file))

logger.info('Extracting relevant GeoTIFFs from source NetCDFs')
for key, val in data_dict.items():
    val['tifs'] = util_files.convert_netcdf(val['raw_data_file'], val['sds'])

# no masking necessary, in theory
# most of the datasets do need to be scaled, though
alltifs = []
for key, val in data_dict.items():
    if 'scale_factor' in val and float(val['scale_factor']) != 1.0:
        finaltifs = []
        for tif in val['tifs']:
            finaltifs.append(util_files.scale_geotiff(tif))
    else:
        finaltifs = val['tifs']
    val['finaltifs'] = finaltifs
    alltifs.extend(finaltifs)

logger.info('Downloading raw data')
# download the data from the source
raw_data_file = [os.path.join(data_dir, os.path.basename(url)) for url in url_list]
for url, file in zip(url_list, raw_data_file):
    urllib.request.urlretrieve(url, file)

# unzip source data
raw_data_file_unzipped = [file[:-3] for file in raw_data_file]
for file, file_unzipped in zip(raw_data_file, raw_data_file_unzipped):
    with gzip.open(file, 'rb') as f_in:
        with open(file_unzipped, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# convert the netcdf files to tif files
for raw_file in raw_data_file_unzipped:
    util_files.convert_netcdf(raw_file, ['precip'])
processed_data_file = [os.path.join(raw_file[:-3]+'_precip.tif') for raw_file in raw_data_file_unzipped]

processed_data_annual = [
    os.path.join(data_dir, 'full_data_annual_v2020_' + str(year) + '_025_precip.tif')
    for year in range(1891, 2020)
]
# number of years (sets of 12 monthly layers) contained in each source file
n_layers = [int(rasterio.open(file).meta['count'] / 12) for file in processed_data_file]

# calculate annual total precipitation
for id, file in enumerate(processed_data_file):
    with rasterio.open(file) as src0:
        # update metadata for the annual aggregation: one band per output file, and a
        # nodata value equal to the sum of 12 monthly nodata cells
        meta = src0.meta
        meta.update(count=1)
        meta.update(nodata=meta['nodata'] * 12)
        # sum each block of 12 monthly layers and export the annual total as a tif file
        for i in range(int(src0.meta['count'] / 12)):
            with rasterio.open(processed_data_annual[sum(n_layers[:id]) + i], 'w', **meta) as dst:
                # the body of this block is assumed: write the sum of the 12 monthly
                # bands for this year as the single output band
                dst.write(src0.read(list(range(i * 12 + 1, (i + 1) * 12 + 1))).sum(axis=0), 1)
# build the list of per-year raw data archives to extract below
# (the file name pattern here is assumed; only the year range comes from the source)
raw_data_file = [
    os.path.join(data_dir, 'raw_data_{}.zip'.format(year))
    for year in range(1992, 2020)
]
for raw_data in raw_data_file:
    zip_ref = ZipFile(raw_data, 'r')
    zip_ref.extractall(data_dir)
    zip_ref.close()
raw_data_file_unzipped = glob.glob(os.path.join(data_dir, '*.nc'))
# sort files by year (the year is the 8th '-'-separated token of the file name)
raw_data_file_unzipped = sorted(raw_data_file_unzipped,
                                key=lambda x: x.split('-')[7])
'''
Process data
'''
# convert the netcdf files to tif files
for raw_file in raw_data_file_unzipped:
    util_files.convert_netcdf(raw_file, ['lccs_class'])
processed_data_file = [
    os.path.join(raw_file[:-3] + '_lccs_class.tif')
    for raw_file in raw_data_file_unzipped
]
'''
Upload processed data to Google Earth Engine
'''
# set the upload chunk size
# the default setting requires an upload speed of at least 10 MB/min; reduce the chunk size if the network connection is poor
storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB
storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB

logger.info('Uploading processed data to Google Cloud Storage.')
# set up Google Cloud Storage project and bucket objects
gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT"))
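# a minimal sketch of the next steps, assuming the same GEE_STAGING_BUCKET environment
# variable and util_cloud.gcs_upload helper used in the later example; dataset_name is
# assumed to be defined earlier in the script
gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET"))
gcs_uris = util_cloud.gcs_upload(processed_data_file, dataset_name, gcs_bucket=gcsBucket)
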
Example #5
# define the years to separate the netcdf into
years = np.arange(1950, 2021)

# loop through years
for year in years:
    # define the cdo command to extract a single year from the netcdf
    cmd = ('cdo -selyear,{} {} {}'.format(
        year, annual_data_file, processed_data_file_convention.format(year)))
    subprocess.check_output(cmd, shell=True)
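# for example, with year 1950 the command expands to roughly
#   cdo -selyear,1950 <annual_data_file> <processed_data_file_convention.format(1950)>
# i.e. cdo writes a new netcdf containing only the timesteps that fall in that year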

# convert the netcdf files to tif files
processed_data_files = [
    processed_data_file_convention.format(year) for year in years
]
for raw_file in processed_data_files:
    util_files.convert_netcdf(raw_file, ['air'])
processed_data_annual = [
    os.path.join(raw_file[:-3] + '_air.tif')
    for raw_file in processed_data_files
]
'''
Upload processed data to Google Earth Engine
'''
logger.info('Uploading processed data to Google Cloud Storage.')
# set up Google Cloud Storage project and bucket objects
print(os.environ.get("CLOUDSDK_CORE_PROJECT"))
gcsClient = storage.Client(os.environ.get("CLOUDSDK_CORE_PROJECT"))
gcsBucket = gcsClient.bucket(os.environ.get("GEE_STAGING_BUCKET"))

# upload files to Google Cloud Storage
gcs_uris = util_cloud.gcs_upload(processed_data_annual, dataset_name,
                                 gcs_bucket=gcsBucket)  # arguments after the file list are assumed
Example #6
'''
Process data
'''
# netcdf subdatasets that will be used in processing
subdatasets = [
    'n_gt0',  # The number of events for which the thermal stress, measured by Degree Heating Weeks, exceeded 0 degC-weeks.
    'n_ge4',  # The number of events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 4 degC-weeks.
    'n_ge8',  # The number of events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 8 degC-weeks.
    'rp_gt0',  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, exceeded 0 degC-weeks.
    'rp_ge4',  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 4 degC-weeks.
    'rp_ge8'  # The average time between events for which the thermal stress, measured by Degree Heating Weeks, reached or exceeded 8 degC-weeks.
]
mask_sds = ['mask']

logger.info('Extracting relevant GeoTIFFs from source NetCDF')
# convert netcdf to individual tif files for each of the subdatasets specified
tifs = util_files.convert_netcdf(raw_data_file, subdatasets)
mask = util_files.convert_netcdf(raw_data_file, mask_sds)[0]

logger.info('Masking GeoTIFFs to reflect dataset coverage')
nodata = -128
maskedtifs = []
# enumerate keeps the subdataset name in step with its GeoTIFF
for i, target in enumerate(tifs):
    sds = subdatasets[i]
    maskedtif = os.path.join(data_dir,
                             os.path.basename(target)[:-4] + '_masked.tif')
    util_files.mask_geotiff(target, mask, maskedtif, nodata=nodata)
    maskedtifs.append(maskedtif)

# generate a name for processed tif
processed_data_file = os.path.join(data_dir, dataset_name + '.tif')
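
# a minimal sketch, not from the source: one way to combine the masked single-band
# GeoTIFFs into the multiband processed file named above, using rasterio (already used
# in the other examples) rather than a project helper
with rasterio.open(maskedtifs[0]) as src0:
    meta = src0.meta.copy()
meta.update(count=len(maskedtifs))
with rasterio.open(processed_data_file, 'w', **meta) as dst:
    for band_index, tif in enumerate(maskedtifs, start=1):
        with rasterio.open(tif) as src:
            dst.write(src.read(1), band_index)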