def processNewData(files, var_num, last_file):
    '''process, upload, and clean new data'''
    # Nothing downloaded means nothing to convert or upload.
    if not files:
        return []
    # Turn the downloaded netcdfs into tifs.
    logging.info('Converting files')
    tifs = convert(files, var_num, last_file)
    # Build the parallel lists GEE needs: a date string parsed from each
    # tif name, the matching datetime object, and the asset name
    # (image collection + tif name) each file will be stored under.
    logging.info('Uploading files')
    dates = [getDateTime(tif) for tif in tifs]
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    assets = [getAssetName(date) for date in dates]
    # Push everything up to Google Earth Engine.
    eeUtil.uploadAssets(tifs, assets, GS_FOLDER, datestamps, timeout=3000)
    # Optionally remove the local copies once they are on GEE.
    if DELETE_LOCAL:
        logging.info('Cleaning local TIFF files')
        for tif in tifs:
            os.remove(tif)
    return assets
def processNewData(files, var_num, last_date):
    '''
    Process and upload clean new data
    INPUT   files: list of file names for netcdfs that have been downloaded (list of strings)
            var_num: index number for variable we are currently processing (integer)
            last_date: name of file for last date of forecast (string)
    RETURN  assets: list of GEE asset names created for the uploaded tifs (list of strings)
    '''
    # BUG FIX (docs): the RETURN description previously claimed this returns
    # "file names for netcdfs that have been downloaded"; it actually returns
    # the GEE asset names built below and passed to eeUtil.uploadAssets.
    # get name of variable we are processing files for
    var = VARS[var_num]
    # if files is empty list do nothing, otherwise, process data
    if files:
        logging.info('Converting files')
        # Convert netcdfs to tifs; convert() returns two lists, only the
        # second of which is uploaded here
        all_tifs, tifs = convert(files, var_num, last_date)
        # get new list of date strings (in case order is different) from the tifs
        dates = [getDateTimeString(tif) for tif in tifs]
        # generate datetime objects for each tif date
        datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
        # Get a list of the names we want to use for the assets once we upload the files to GEE
        assets = [getAssetName(var, date) for date in dates]
        logging.info('Uploading files:')
        for asset in assets:
            logging.info(os.path.split(asset)[1])
        # Upload new files (tifs) to GEE
        eeUtil.uploadAssets(tifs, assets, GS_FOLDER, datestamps, timeout=3000)
        # Delete local tif files
        logging.info('Cleaning local TIFF files')
        delete_local(ext='.tif')
    # if no new assets, return empty list
    else:
        assets = []
    return assets
def processNewData(existing_dates):
    '''fetch, process, upload, and clean new data'''
    # Work out which dates are missing from GEE, then try to download them.
    new_dates = getNewDates(existing_dates)
    logging.info('Fetching files')
    files = fetch(new_dates)
    # An empty download list means there is nothing to process.
    if not files:
        return []
    # netcdf -> tif conversion.
    logging.info('Converting files')
    tifs = convert(files)
    # Parallel lists for the upload: a date string from each tif name,
    # the matching datetime object, and the GEE asset name
    # (image collection + tif name).
    logging.info('Uploading files')
    dates = [getDate(tif) for tif in tifs]
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    assets = [getAssetName(date) for date in dates]
    eeUtil.uploadAssets(tifs, assets, GS_FOLDER, datestamps)
    # Remove local copies (tifs first, then source netcdfs) unless the
    # configuration asks to keep them.
    if DELETE_LOCAL:
        logging.info('Cleaning local files')
        for local_file in tifs + files:
            os.remove(local_file)
    return assets
def processNewData(existing_dates): '''fetch, process, upload, and clean new data''' # 1. Determine which files to fetch new_dates = getNewDates(existing_dates) # 2. Fetch new files logging.info('Fetching files') files = fetch(new_dates) if files: # 3. Convert new files logging.info('Converting files') tifs = convert(files) # 4. Upload new files logging.info('Uploading files') dates = [getDate(tif) for tif in tifs] datestamps = [ datetime.datetime.strptime(date, DATE_FORMAT) for date in dates ] assets = [getAssetName(date) for date in dates] eeUtil.uploadAssets(tifs, assets, GS_FOLDER, datestamps) # 5. Delete local files logging.info('Cleaning local files') for tif in tifs: os.remove(tif) for f in files: os.remove(f) return assets return []
def _processAssets(tifs, rw_id, varname):
    '''Upload the given tifs to GEE and return the asset names used.'''
    assets = []
    datestamps = []
    for tif in tifs:
        assets.append(getAssetName(tif, rw_id, varname))
        # Append '-0' so the parsed date lands on the Sunday that ends
        # the reported week.
        week = getRasterDate(tif)
        datestamps.append(datetime.datetime.strptime(week + '-0', DATE_FORMAT_ISO))
    destination = GS_PREFIX.format(rw_id=rw_id, varname=varname)
    eeUtil.uploadAssets(tifs, assets, destination, datestamps, timeout=3000)
    return assets
def processNewData(existing_dates):
    '''
    fetch, process, upload, and clean new data
    INPUT   existing_dates: list of dates we already have in GEE, in the format of the DATE_FORMAT variable (list of strings)
    RETURN  assets: list of GEE assets that have been created (list of strings)
    '''
    # Get list of new dates we want to try to fetch data for
    target_dates = getNewDates(existing_dates)
    # BUG FIX: previously the netcdf was downloaded and parsed even when there
    # were no target dates, and in that path the local file was never removed
    # (leaking one file per run). Skip the download entirely when there is
    # nothing to process.
    if not target_dates:
        return []
    # Fetch data file from source
    logging.info('Fetching files')
    nc_file = fetch(os.path.join(DATA_DIR, 'nc_file.nc'))
    # Get a list of dates of data available from netcdf file, in the format of the DATE_FORMAT variable
    available_dates = retrieve_formatted_dates(nc_file)
    # Fetch metadata from netcdf
    dtype, nodata = extract_metadata(nc_file)
    logging.info('type: ' + dtype)
    logging.info('nodata val: ' + str(nodata))
    # Create new tifs from netcdf file for available dates
    logging.info('Converting files')
    sub_tifs = extract_subdata_by_date(nc_file, dtype, nodata, available_dates, target_dates)
    logging.info(sub_tifs)
    logging.info('Uploading files')
    # Get a list of the dates we have to upload from the tif file names
    dates = [getDate(tif) for tif in sub_tifs]
    # Get a list of datetimes from these dates for each of the dates we are uploading
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    # Get a list of the names we want to use for the assets once we upload the files to GEE
    assets = [getAssetName(date) for date in dates]
    # Upload new files (tifs) to GEE
    eeUtil.uploadAssets(sub_tifs, assets, GS_FOLDER, datestamps, timeout=900)
    # Delete local files
    logging.info('Cleaning local files')
    os.remove(nc_file)
    for tif in sub_tifs:
        logging.debug('deleting: ' + tif)
        os.remove(tif)
    return assets
def processNewData(existing_dates):
    '''
    fetch, process, upload, and clean new data
    INPUT   existing_dates: list of dates we already have in GEE, in the format of the DATE_FORMAT variable (list of strings)
    RETURN  new_assets: list of GEE assets created from the downloaded hdfs (list of strings)
    '''
    # Get list of new dates we want to try to fetch data for
    target_dates = getNewDates(existing_dates)
    logging.debug('Target dates: {}'.format(target_dates))
    # fetch new files
    logging.info('Fetching files')
    # Distinct years covered by the target dates; source data is fetched
    # one year at a time
    years = {date[0:4] for date in target_dates}
    # asset names that have been uploaded to GEE
    new_assets = []
    # fetch and process data one year at a time
    for year in years:
        # Fetch new files for this year
        fetch(year)
        # list all hdf files in the current directory
        files = glob.glob('*.hdf')
        # tif filenames created from hdf files
        tifs = []
        for _file in files:
            # get date from filename in the format YYYYMM
            date = getDateFromSource(_file)
            # skip dates we already have in GEE
            if date not in existing_dates:
                logging.info('Converting file: {}'.format(_file))
                # convert hdfs to tifs and store the tif filenames
                tifs.append(convert(_file, date))
        # Robustness: nothing new for this year, so skip the upload call
        # instead of uploading an empty list
        if not tifs:
            continue
        logging.info('Uploading files')
        # Date string for each tif, the asset name to upload it under,
        # and the matching datetime object
        dates = [getDate(tif) for tif in tifs]
        assets = [getAssetName(tif) for tif in tifs]
        datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
        # Upload new files (tifs) to GEE
        eeUtil.uploadAssets(tifs, assets, GS_FOLDER, dates=datestamps, public=True,
                            timeout=3000)
        # record the assets uploaded for this year
        new_assets.extend(assets)
        # Delete local files before the next year's fetch so stale hdfs are
        # not picked up by the glob and re-processed
        clearDir()
    # BUG FIX: the original fell off the end without a return statement, so
    # callers always received None despite the documented return value
    return new_assets
def processNewData(existing_dates):
    '''fetch, process, upload, and clean new data'''
    # 1. Determine which years to read from the netCDF file
    target_dates = getNewTargetDates(existing_dates)
    # BUG FIX: previously the netcdf was downloaded and parsed even when there
    # were no target dates, and in that path the local file was never removed
    # (leaking one file per run). Skip the download when nothing is needed.
    if not target_dates:
        return []
    # 2. Fetch datafile
    logging.info('Fetching files')
    nc_file = fetch(os.path.join(DATA_DIR, 'nc_file.nc'))
    # dates available in the netcdf, formatted per DATE_FORMAT
    available_dates = retrieve_formatted_dates(nc_file)
    # data type and nodata value read from the netcdf
    dtype, nodata = extract_metadata(nc_file)
    logging.info('type: ' + dtype)
    logging.info('nodata val: ' + str(nodata))
    # 3. Convert new files
    logging.info('Converting files')
    sub_tifs = extract_subdata_by_date(nc_file, dtype, nodata, available_dates, target_dates)
    logging.info(sub_tifs)
    # 4. Upload new files
    logging.info('Uploading files')
    # date string per tif, the matching datetime, and the GEE asset name
    dates = [getDate(tif) for tif in sub_tifs]
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    assets = [getAssetName(date) for date in dates]
    eeUtil.uploadAssets(sub_tifs, assets, GS_FOLDER, datestamps, timeout=900)
    # 5. Delete local files
    logging.info('Cleaning local files')
    os.remove(nc_file)
    for tif in sub_tifs:
        logging.debug('deleting: ' + tif)
        os.remove(tif)
    return assets
def processNewData(existing_dates):
    '''
    fetch, process, upload, and clean new data
    INPUT   existing_dates: list of dates we already have in GEE, in the format of the DATE_FORMAT variable (list of strings)
    RETURN  assets: list of GEE asset names created for the uploaded data (list of strings)
    '''
    # Figure out which dates are missing and try to download them.
    new_dates = getNewDates(existing_dates)
    logging.info('Fetching files')
    files = fetch(new_dates)
    # Bail out early when nothing new could be downloaded.
    if not files:
        return []
    # netcdf -> tif conversion.
    logging.info('Converting files')
    tifs = convert(files)
    logging.info('Uploading files')
    # Parallel lists for the upload: date strings parsed from the tif
    # names, the matching datetime objects, and the GEE asset names.
    dates = [getDate(tif) for tif in tifs]
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    assets = [getAssetName(date) for date in dates]
    # Push the tifs to GEE.
    eeUtil.uploadAssets(tifs, assets, GS_FOLDER, datestamps)
    # Remove every local copy: tifs first, then the source netcdfs.
    logging.info('Cleaning local files')
    for local_file in tifs + files:
        os.remove(local_file)
    return assets
def _processAssets1(tifs, rw_id, varname):
    '''
    Upload tifs to GEE, retrying once on failure.
    INPUT   tifs: local tif files to upload (list of strings)
            rw_id: Resource Watch id used to build asset names (string)
            varname: variable name used to build asset names (string)
    RETURN  assets: GEE asset names the tifs were uploaded under (list of strings);
            note the list is returned even if both upload attempts fail
    '''
    assets = [getAssetName(tif, rw_id, varname) for tif in tifs]
    dates = [getRasterDate(tif) for tif in tifs]
    # Set date to the end of the reported week,
    # -0 corresponding to Sunday at end of week
    datestamps = [datetime.datetime.strptime(date + '-0', DATE_FORMAT_ISO)
                  for date in dates]
    # try to upload data twice before quitting
    try_num = 1
    while try_num <= 2:
        try:
            logging.info('Upload {} try number {}'.format(varname, try_num))
            eeUtil.uploadAssets(tifs, assets,
                                GS_PREFIX.format(rw_id=rw_id, varname=varname),
                                datestamps, timeout=3000)
            break
        # BUG FIX: was a bare `except:`, which also traps SystemExit and
        # KeyboardInterrupt and silently hides the real upload error; catch
        # Exception and log it so failures are visible
        except Exception as e:
            logging.error('Upload attempt {} for {} failed: {}'.format(try_num, varname, e))
            try_num += 1
    return assets
def main():
    # Scratch/one-off uploader for the MODIS surface water collection:
    # downloads soilgrids rasters over FTP and uploads tifs to GEE.
    ###
    # Configure logging
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    # Authenticate to GEE
    eeUtil.initJson()
    ###
    ###
    # Configure the ImageCollection you're going to add the rasters to
    ###
    GS_FOLDER = 'wat_038_modis_surface_water'
    EE_COLLECTION = 'wat_038_modis_surface_water'

    def ic(asset):
        # Build the full GEE path for an asset inside the collection
        return '{}/{}'.format(EE_COLLECTION, os.path.splitext(asset)[0])

    def checkCreateCollection(collection):
        '''List assests in collection else create new collection'''
        if eeUtil.exists(collection):
            return eeUtil.ls(collection)
        else:
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True, public=True)
            return []

    existing_files = checkCreateCollection(EE_COLLECTION)
    ###
    # Obtain names of files to upload
    # Load file names for tifs and netcdfs
    ###
    # TIF_DATA_DIR = 'tifs'
    # os.chdir(TIF_DATA_DIR)
    # tifs = os.listdir('.') #[f for f in os.listdir('.') if os.path.splitext(f)[1] == '.tif']
    # logging.info('TIFFs: {}'.format(tifs))
    #
    # NC_DATA_DIR = 'ncs'
    # os.chdir(NC_DATA_DIR)
    # ncs = os.listdir('.') #[f for f in os.listdir('.') if os.path.splitext(f)[1] == '.tif']
    # logging.info('NetCDFs: {}'.format(ncs))
    ###
    # Priority 1: Load files to GEE and register w/ RW API
    ###
    from ftplib import FTP
    ftp = FTP('ftp.soilgrids.org')
    ftp.login()
    lines = []
    ftp.retrlines('NLST', lines.append)
    data = []
    ftp.retrlines('NLST data/recent', data.append)
    # NLST returns 'data/recent/<name>'; keep just the file name
    data = [f.split('/')[2] for f in data]
    logging.info("Data:")
    logging.info(data)
    import re
    pattern = re.compile('OCDENS_M_sl._250m.tif')
    soilcarbon = [f for f in data if pattern.match(f)]
    logging.info("SoilCarbon data:")
    logging.info(soilcarbon)
    #for datum in data:
    for datum in soilcarbon:
        logging.info('Processing {}'.format(datum))
        # NOTE(review): datum is a bare file name but the files live under
        # data/recent/ on the server — confirm the RETR path actually resolves
        with open('ncs/{}'.format(datum), 'wb') as f:
            ftp.retrbinary('RETR ' + datum, f.write)
    ###
    # Priority 2: Access pre-made SLDs for loading to layers
    ###
    ###
    ###
    # Retrieving legends for upload to RW API
    ###
    legends = []
    ftp.retrlines('NLST legends', legends.append)
    slds = [
        f.split('/')[1] for f in legends
        if os.path.splitext(f)[1] == '.sld'
    ]
    for sld in slds:
        logging.info('Processing {}'.format(sld))
        # NOTE(review): this file handle is never closed (unlike the `with`
        # block above) — a resource leak; also RETR uses the bare sld name
        # while the listing came from legends/ — verify the path
        f = open(os.path.join(os.getcwd(), sld), 'wb')
        ftp.retrbinary('RETR ' + sld, f.write)
    ftp.close()
    # Q: Is this possible?
    ### reduce(lambda obj, elem: obj.append(elem), ftp.retrlines('NLST'), []) ###
    ###
    # To upload to GEE, need to specify the date
    # Date formats vary by provider, some common ones include:
    ###
    # NOTE(review): each DATE_FORMAT/getDate pair below overrides the previous
    # one — only the last pair (constant year '2017') is in effect.
    ### Date encoded in asset name
    DATE_FORMAT = '%Y%j'  # Year and week of year

    def getDate(asset):
        return asset[-7:]

    DATE_FORMAT = '%Y-%m-%d'  # Year, month, day

    def getDate(asset):
        return asset[-10:]

    DATE_FORMAT = '%Y'  # Year

    def getDate(asset):
        return asset[-4:]

    ### Constant year
    DATE_FORMAT = '%Y'  # Year

    def getDate(asset):
        return '2017'

    ### Grab dates, create datestamps, upload through GEE
    # NOTE(review): `tifs` is never assigned in this function (it only appears
    # in the commented-out listing code above) — this line raises NameError as
    # written; restore the listing code or pass the tif list in.
    dates = list(map(getDate, tifs))
    # NOTE(review): uses `datetime.strptime`, implying `datetime` here is the
    # class (from datetime import datetime), unlike other functions in this
    # file which call datetime.datetime.strptime — confirm the module import.
    datestamps = [datetime.strptime(date, DATE_FORMAT) for date in dates]
    asset_names = [ic(t) for t in tifs]
    eeUtil.uploadAssets(tifs, asset_names, GS_FOLDER, datestamps,
                        public=True, timeout=30000)
def processNewRasterData(existing_dates, arctic_or_antarctic, new_or_hist, month=None):
    '''
    fetch, process, upload, and clean new data
    INPUT   existing_dates: dates already in GEE (list of strings)
            arctic_or_antarctic: 'arctic' or 'antarctic', selects projection and extent (string)
            new_or_hist: 'new' or 'hist', selects which target dates to process (string)
            month: month to use when new_or_hist is 'hist' (optional)
    RETURN  orig_assets, reproj_assets: GEE asset names for the original and
            reprojected rasters (two lists of strings)
    '''
    # 1. Determine which years to read from the ftp file
    if new_or_hist == 'new':
        target_dates = getNewTargetDates(existing_dates) or []
    elif new_or_hist == 'hist':
        target_dates = getHistoricalTargetDates(existing_dates, month=month) or []
    else:
        # BUG FIX: any other value previously fell through both branches and
        # later raised a confusing NameError on target_dates; fail fast with
        # a clear message instead
        raise ValueError("new_or_hist must be 'new' or 'hist', got {!r}".format(new_or_hist))
    logging.debug(target_dates)
    # 2. Fetch datafile
    logging.info('Fetching {} files'.format(arctic_or_antarctic))
    orig_tifs = []
    reproj_tifs = []
    # Source projection and bounding box depend on which pole we are processing
    if arctic_or_antarctic == 'arctic':
        s_srs = 'EPSG:3411'
        extent = '-180 50 180 89.75'
    else:
        s_srs = 'EPSG:3412'
        extent = '-180 -89.75 180 -50'
    for date in target_dates:
        if date not in existing_dates:
            # download the source raster, then reproject it
            orig_file = fetch(SOURCE_URL_MEASUREMENT, arctic_or_antarctic, date)
            reproj_file = reproject(orig_file, s_srs=s_srs, extent=extent)
            orig_tifs.append(os.path.join(DATA_DIR, orig_file))
            reproj_tifs.append(os.path.join(DATA_DIR, reproj_file))
            logging.debug('New files: orig {}, reproj {}'.format(orig_file, reproj_file))
    # 3. Upload new files
    logging.info('Uploading {} files'.format(arctic_or_antarctic))
    orig_assets = [getAssetName(tif, 'orig', new_or_hist) for tif in orig_tifs]
    reproj_assets = [getAssetName(tif, 'reproj', new_or_hist) for tif in reproj_tifs]
    # one datetime per raster, parsed from the reprojected tif names
    dates = [getRasterDate(tif) for tif in reproj_tifs]
    datestamps = [datetime.datetime.strptime(date, DATE_FORMAT) for date in dates]
    eeUtil.uploadAssets(orig_tifs, orig_assets, GS_PREFIX, datestamps, timeout=3000)
    eeUtil.uploadAssets(reproj_tifs, reproj_assets, GS_PREFIX, datestamps, timeout=3000)
    # 4. Delete local files
    for tif in orig_tifs + reproj_tifs:
        logging.debug('Deleting: {}'.format(tif))
        os.remove(tif)
    return orig_assets, reproj_assets
def main():
    # Scratch/one-off uploader for the soil organic carbon collection:
    # downloads soilgrids rasters over FTP, uploads tifs to GEE, and
    # registers the image collection with the RW API back office.
    ###
    # Configure logging
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    # Authenticate to GEE
    eeUtil.initJson()
    ###
    ###
    # Configure the ImageCollection you're going to add the rasters to
    ###
    GS_FOLDER = 'foo_054_soil_organic_carbon'
    EE_COLLECTION = 'foo_054_soil_organic_carbon'

    def ic(asset):
        # Build the full GEE path for an asset inside the collection
        return '{}/{}'.format(EE_COLLECTION, os.path.splitext(asset)[0])

    def checkCreateCollection(collection):
        '''List assests in collection else create new collection'''
        if eeUtil.exists(collection):
            return eeUtil.ls(collection)
        else:
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True, public=True)
            return []

    existing_files = checkCreateCollection(EE_COLLECTION)
    ###
    # Obtain names of files to upload
    ###
    ###
    # Priority 1: Load files to GEE and register w/ RW API
    ###
    from ftplib import FTP
    ftp = FTP('ftp.soilgrids.org')
    ftp.login()
    folders = []
    ftp.retrlines('NLST', folders.append)
    logging.info("Folders:")
    logging.info(folders)
    data = []
    ftp.retrlines('NLST data/recent', data.append)
    # NLST returns 'data/recent/<name>'; keep just the file name
    data = [f.split('/')[2] for f in data]
    logging.info("Data:")
    logging.info(data)
    import re
    # Matches soil carbon for different depths:
    # 0, 5, 15, 30, 60, 100, 200 cm depth tifs available,
    # labeled sl1 - sl7
    # http://data.isric.org/geonetwork/srv/eng/catalog.search;jsessionid=A5137293CC6B3D96CBA35808CA155341#/metadata/98062ae9-911d-4e04-80a9-e4b480f87799
    pattern = re.compile('OCSTHA_M_sd._250m.tif')
    soilcarbon = [f for f in data if pattern.match(f)]
    logging.info("SoilCarbon data:")
    logging.info(soilcarbon)
    SOURCE_URL = 'ftp://ftp.soilgrids.org/data/recent/{f}'

    def getUrl(lvl):
        # URL for one file under data/recent
        return SOURCE_URL.format(f=lvl)

    def getFilename(lvl):
        # Local path the file is saved to
        return 'tifs/{}'.format(lvl)

    ## Download with ftplib
    # Track progress:
    # https://stackoverflow.com/questions/21343029/how-do-i-keep-track-of-percentage-downloaded-with-ftp-retrbinary
    def download_file(f, block, totalSize, sizeWritten):
        # Write one retrbinary block and log cumulative progress;
        # sizeWritten is a shared list accumulating per-block byte counts
        f.write(block)
        sizeWritten.append(len(block))
        logging.info("{} = size written, {} = total size".format(
            sum(sizeWritten), totalSize))
        percentComplete = sum(sizeWritten) / totalSize
        logging.info("{} percent complete".format(percentComplete))

    # NOTE(review): this loop variable rebinds `data`, shadowing the file
    # list built above — rename for clarity if this script is kept
    for data in soilcarbon:
        logging.info('Processing {}'.format(data))
        totalSize = ftp.size('data/recent/' + data)
        sizeWritten = []
        with open('tifs/{}'.format(data), 'wb') as f:
            ftp.retrbinary(
                'RETR data/recent/' + data,
                lambda block: download_file(f, block, totalSize, sizeWritten))
    ###
    ## Download with urllib
    # def fetch(files):
    #     '''Fetch files by datestamp'''
    #     tifs = []
    #     for lvl in files:
    #         url = getUrl(lvl)
    #         f = getFilename(lvl)
    #         logging.debug('Fetching {}'.format(url))
    #         # New data may not yet be posted
    #         try:
    #             urllib.request.urlretrieve(url, f)
    #             tifs.append(f)
    #         except Exception as e:
    #             logging.warning('Could not fetch {}'.format(url))
    #             logging.debug(e)
    #     return tifs
    #
    #
    # tifs = fetch(soilcarbon)
    ###
    # To upload to GEE, need to specify the date
    # Date formats vary by provider, some common ones include:
    ###
    ### Constant year
    DATE_FORMAT = '%Y'  # Year

    def getDate(asset):
        return '2017'

    ### Grab dates, create datestamps, upload through GEE
    # NOTE(review): `tifs` is only assigned in the commented-out fetch()
    # above, so this line raises NameError as written — re-enable the fetch
    # or build the tif list from the downloads performed earlier.
    dates = list(map(getDate, tifs))
    # NOTE(review): uses `datetime.strptime`, implying `datetime` here is the
    # class (from datetime import datetime) — confirm the module-level import.
    datestamps = [datetime.strptime(date, DATE_FORMAT) for date in dates]
    asset_names = [ic(t) for t in tifs]
    eeUtil.uploadAssets(tifs, asset_names, GS_FOLDER, datestamps,
                        public=True, timeout=30000)
    ###
    # Upload to RW API
    # For this and writing in the SLDs, could use Brookie's class
    # Would match the SLD name to the tif name, pair them and upload (like a zip)
    ###
    API_TOKEN = os.environ.get('rw_api_token', None)

    def createHeaders():
        # NOTE(review): references AUTH_TOKEN, but the token above is stored
        # in API_TOKEN — calling this raises NameError unless AUTH_TOKEN is
        # defined elsewhere; likely should read API_TOKEN.
        return {
            'content-type': "application/json",
            'authorization': "Bearer {}".format(AUTH_TOKEN)
        }

    def upload_ic_to_backoffice(wri_id, imageCollectionName, datasetName):
        # Register the GEE image collection as a raster dataset with the
        # RW back office; returns the new dataset id
        ds_specs = {
            "connectorType": "rest",
            "provider": "gee",
            "tableName": imageCollectionName,
            "application": ["rw"],
            "geoInfo": True,
            "type": "raster",
            "name": "{}_{}".format(wri_id, datasetName)
        }
        # NOTE(review): `req` is presumably the requests library imported at
        # file level — verify the import alias
        create_res = req.request(
            "POST",
            'https://staging-api.globalforestwatch.org/v1/dataset',
            data=json.dumps(ds_specs),
            headers=createHeaders())
        logging.info(create_res.text)
        return create_res.json()['data']['id']

    rw_id = upload_ic_to_backoffice('foo.054', EE_COLLECTION, 'Soil Organic Carbon')
    ###
    # Priority 2: Access pre-made SLDs for loading to layers
    ###
    ###
    ###
    # Retrieving legends for upload to RW API
    ###
    legends = []
    ftp.retrlines('NLST legends', legends.append)
    slds = [
        f.split('/')[1] for f in legends
        if os.path.splitext(f)[1] == '.sld'
    ]
    for sld in slds:
        logging.info('Processing {}'.format(sld))
        with open('slds/{}'.format(sld), 'wb') as f:
            ftp.retrbinary('RETR legends/' + sld, f.write)
    ftp.close()
eeUtil.createFolder(collection, True, public=True) return [] existing_files = checkCreateCollection(EE_COLLECTION) # Make sure your data is in the rasters folder DATA_DIR = 'rasters' EXTENSIONS = ['.tif', '.nc'] os.chdir(DATA_DIR) tifs = [f for f in os.listdir('.') if os.path.splitext(f)[1] in EXTENSIONS] logging.info('TIFS: {}'.format(tifs)) # Update this manually, or with a function of the tif name DATE_FORMAT = '%Y' dates = ['2010', '2010', '2010', '2010', '2010', '2010'] datestamps = [datetime.strptime(date, DATE_FORMAT) for date in dates] def ic(asset): return '{}/{}'.format(EE_COLLECTION, os.path.splitext(asset)[0]) asset_names = [ic(t) for t in tifs] eeUtil.uploadAssets(tifs, asset_names, GS_FOLDER, datestamps, public=True, timeout=30000)