def checkCreateCollection(VARS):
    '''
    For each variable, list the dates already present in its GEE collection
    (creating the collection, and the parent folder, when missing).

    Returns a tuple:
      - dates fully uploaded for *every* variable
      - per-variable lists of dates, in the same order as VARS
    '''
    # master list of dates seen in any variable's collection
    existing_dates = []
    # one date list per variable; used to detect a crashed partial upload
    existing_dates_by_var = []
    for VAR in VARS:
        # For one of the variables, get the date of the most recent data set.
        # All variables come from the same file, so if we have one for a
        # particular date, we should have them all.
        collection = EE_COLLECTION_GEN.format(var=VAR)
        # the parent folder must exist before collections can live inside it
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)
        if eeUtil.exists(collection):
            assets = eeUtil.ls(collection)
            dates = [getDate_GEE(asset) for asset in assets]
            existing_dates_by_var.append(dates)
            # merge into the master list without duplicates
            for date in dates:
                if date not in existing_dates:
                    existing_dates.append(date)
        else:
            # no collection yet for this variable: empty date list, then create it
            existing_dates_by_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
    # Keep only the dates that every variable uploaded completely; a fully
    # uploaded date appears len(TIME_HOURS) times per variable.
    existing_dates_all_vars = copy.copy(existing_dates)
    for date in existing_dates:
        appearances = sum(var_dates.count(date) for var_dates in existing_dates_by_var)
        if appearances / len(TIME_HOURS) < len(VARS):
            existing_dates_all_vars.remove(date)
    return existing_dates_all_vars, existing_dates_by_var
def checkCreateCollection(collection):
    '''List assets in the collection if it exists; otherwise create it and return [].'''
    # the parent folder must exist before we can list or create collections in it
    if not eeUtil.exists(PARENT_FOLDER):
        logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
        eeUtil.createFolder(PARENT_FOLDER, public=True)
    # missing collection: create it; there are no assets yet
    if not eeUtil.exists(collection):
        logging.info('{} does not exist, creating'.format(collection))
        eeUtil.createFolder(collection, True, public=True)
        return []
    return eeUtil.ls(collection)
def clearCollectionMultiVar():
    '''
    Clear the GEE collection for all variables

    Deletes every asset inside each variable's image collection (the empty
    collections themselves are left in place).
    '''
    logging.info('Clearing collections.')
    # iterate variables directly instead of indexing with range(len(VARS))
    for var in VARS:
        # get name of GEE collection for variable
        collection = getCollectionName(var)
        # if the collection exists,
        if eeUtil.exists(collection):
            # remove the / from the beginning of the collection name to be used in ee module
            if collection[0] == '/':
                collection = collection[1:]
            # pull the image collection
            image_collection = ee.ImageCollection(collection)
            # check how many assets are in the collection
            collection_size = image_collection.size().getInfo()
            # if there are assets in the collection
            if collection_size > 0:
                # list the assets (renamed from `list`, which shadowed the builtin)
                asset_list = image_collection.toList(collection_size)
                # delete each asset
                for item in asset_list.getInfo():
                    ee.data.deleteAsset(item['id'])
def main():
    '''Ensure the GEE collection exists, ingest new data, prune old assets, update RW.'''
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    logging.info('STARTING')
    # Initialize eeUtil
    eeUtil.initJson()
    # Clear the GEE collection, if specified above
    if CLEAR_COLLECTION_FIRST and eeUtil.exists(EE_COLLECTION):
        eeUtil.removeAsset(EE_COLLECTION, recursive=True)
    # Check if collection exists, create it if it does not;
    # if it exists, we get back the list of assets currently in it
    existing_assets = checkCreateCollection(EE_COLLECTION)
    existing_dates = [getDate(asset) for asset in existing_assets]
    # Fetch, process, and upload the new data
    new_assets = processNewData(existing_dates)
    new_dates = [getDate(asset) for asset in new_assets]
    logging.info('Previous assets: {}, new: {}, max: {}'.format(
        len(existing_dates), len(new_dates), MAX_ASSETS))
    # Delete excess assets beyond the configured cap
    deleteExcessAssets(existing_dates + new_dates, MAX_ASSETS)
    # Update Resource Watch
    updateResourceWatch()
    logging.info('SUCCESS')
def main():
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    # Initialize eeUtil and clear collection in GEE if desired
    eeUtil.initJson()
    if CLEAR_COLLECTION_FIRST and eeUtil.exists(EE_COLLECTION):
        eeUtil.removeAsset(EE_COLLECTION, recursive=True)
    # 1. Make the image collection if we don't have one, and read its dates
    existing_assets = checkCreateCollection(EE_COLLECTION)
    existing_dates = [getDate(asset) for asset in existing_assets]
    # 2. Fetch, process, stage, ingest, clean
    new_assets = processNewData(existing_dates)
    new_dates = [getDate(asset) for asset in new_assets]
    # 3. Delete old assets once the combined list exceeds the cap
    existing_dates = existing_dates + new_dates
    logging.info('Existing assets: {}, new: {}, max: {}'.format(
        len(existing_dates), len(new_dates), MAX_ASSETS))
    deleteExcessAssets(existing_dates, MAX_ASSETS)
    logging.info(new_dates)
    # Push the most recent update date to Resource Watch
    most_recent_date = get_most_recent_date(EE_COLLECTION)
    lastUpdateDate(DATASET_ID, most_recent_date)
    logging.info('SUCCESS')
def main():
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    # Initialize eeUtil
    eeUtil.initJson()
    # 1. Clear the collection if requested, then make sure it exists
    if CLEAR_COLLECTION_FIRST and eeUtil.exists(EE_COLLECTION):
        eeUtil.removeAsset(EE_COLLECTION, recursive=True)
    existing_assets = checkCreateCollection(EE_COLLECTION)
    existing_dates = [getDate(asset) for asset in existing_assets]
    # 2. Fetch, process, stage, ingest, clean
    new_assets = processNewData(existing_dates)
    new_dates = [getDate(asset) for asset in new_assets]
    # 3. Delete old assets beyond the cap
    existing_dates = existing_dates + new_dates
    logging.info('Existing assets: {}, new: {}, max: {}'.format(
        len(existing_dates), len(new_dates), MAX_ASSETS))
    deleteExcessAssets(existing_dates, MAX_ASSETS)
    ###
    logging.info('SUCCESS')
def main():
    '''Ingest new data into EE, delete old data, and refresh Resource Watch metadata.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    ### 0. Initialize GEE
    eeUtil.initJson()
    ### 1. Build one collection name per dataset id, clearing them first if requested
    collections = {rw_id: EE_COLLECTION.format(rw_id=rw_id, varname=varname)
                   for rw_id, varname in ASSET_NAMES.items()}
    if CLEAR_COLLECTION_FIRST:
        for collection in collections.values():
            if eeUtil.exists(collection):
                eeUtil.removeAsset(collection, recursive=True)
    ### 2. Grab existing assets and their dates
    existing_assets = {rw_id: checkCreateCollection(coll)
                       for rw_id, coll in collections.items()}
    existing_dates = {rw_id: [getRasterDate(a) for a in assets]
                      for rw_id, assets in existing_assets.items()}
    # Fetch and ingest the new rasters; returns a dict of new assets keyed by rw_id
    new_assets = processNewRasterData(existing_dates)
    new_dates = {rw_id: [getRasterDate(a) for a in assets]
                 for rw_id, assets in new_assets.items()}
    ### 5. Delete old assets
    for rw_id, collection in collections.items():
        e = existing_dates[rw_id]
        n = new_dates.get(rw_id, [])
        logging.info('Existing assets in {}: {}, new: {}, max: {}'.format(
            rw_id, len(e), len(n), MAX_DATES))
        deleteExcessAssets(e + n, rw_id, ASSET_NAMES[rw_id], MAX_DATES)
    # Update each dataset's last-update date on Resource Watch when new data arrived
    for collection, id in DATASET_IDS.items():
        most_recent_date = get_most_recent_date(collection)
        current_date = getLastUpdate(id)
        if current_date != most_recent_date:
            logging.info('Updating last update date and flushing cache.')
            # Update data set's last update date on Resource Watch
            lastUpdateDate(id, most_recent_date)
            # get layer ids and flush tile cache for each
            layer_ids = getLayerIDs(id)
            for layer_id in layer_ids:
                flushTileCache(layer_id)
    logging.info('SUCCESS')
def checkCreateCollection(collection):
    '''
    List assets in collection if it exists, else create new collection
    INPUT   collection: GEE collection to check or create (string)
    RETURN  list of assets in collection (list of strings)
    '''
    # make sure the parent folder holding all collections exists
    parent = '/' + PARENT_FOLDER
    if not eeUtil.exists(parent):
        logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
        eeUtil.createFolder(parent, public=True)
    # missing collection: create it and return an empty asset list
    if not eeUtil.exists(collection):
        logging.info('{} does not exist, creating'.format(collection))
        eeUtil.createFolder(collection, True, public=True)
        return []
    # existing collection: return the assets it holds
    return eeUtil.ls(collection)
def clearCollection():
    '''Delete every asset inside each variable's GEE image collection.'''
    logging.info('Clearing collections.')
    # iterate variables directly instead of indexing with range(len(VARS))
    for var in VARS:
        collection = EE_COLLECTION_GEN.format(var=var)
        if eeUtil.exists(collection):
            # the ee module expects the collection id without a leading slash
            if collection[0] == '/':
                collection = collection[1:]
            image_collection = ee.ImageCollection(collection)
            collection_size = image_collection.size().getInfo()
            if collection_size > 0:
                # renamed from `list`, which shadowed the builtin
                asset_list = image_collection.toList(collection_size)
                for item in asset_list.getInfo():
                    ee.data.deleteAsset(item['id'])
def main():
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    ### 0. Initialize GEE
    eeUtil.initJson()
    ### 1. Build collection names, clearing the collections first if requested
    collections = {rw_id: EE_COLLECTION.format(rw_id=rw_id, varname=varname)
                   for rw_id, varname in ASSET_NAMES.items()}
    if CLEAR_COLLECTION_FIRST:
        for collection in collections.values():
            if eeUtil.exists(collection):
                eeUtil.removeAsset(collection, recursive=True)
    ### 2. Grab existing assets and their dates
    existing_assets = {rw_id: checkCreateCollection(coll)
                       for rw_id, coll in collections.items()}
    existing_dates = {rw_id: [getRasterDate(a) for a in assets]
                      for rw_id, assets in existing_assets.items()}
    # Fetch and ingest new rasters; returns a dict of new assets keyed by rw_id
    new_assets = processNewRasterData(existing_dates)
    new_dates = {rw_id: [getRasterDate(a) for a in assets]
                 for rw_id, assets in new_assets.items()}
    ### 5. Delete old assets
    for rw_id, collection in collections.items():
        e = existing_dates[rw_id]
        n = new_dates.get(rw_id, [])
        logging.info('Existing assets in {}: {}, new: {}, max: {}'.format(
            rw_id, len(e), len(n), MAX_DATES))
        deleteExcessAssets(e + n, rw_id, ASSET_NAMES[rw_id], MAX_DATES)
    ###
    logging.info('SUCCESS')
def main():
    '''Ingest new data into EE for each variable in VARS and delete old data.'''
    # These module-level names are reconfigured here (and per-variable in the
    # loop below) so that helper functions reading them see the right values.
    global VAR
    global BAND
    global EE_COLLECTION
    global PARENT_FOLDER
    global FILENAME
    global DAYS_TO_AVERAGE
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    # Initialize eeUtil and ee
    eeUtil.initJson()
    initialize_ee()
    # Choose folder/collection/file naming templates depending on whether the
    # data is a single day or a multi-day average.
    if DAYS_TO_AVERAGE == 1:
        PARENT_FOLDER = COLLECTION
        EE_COLLECTION_GEN = COLLECTION + '/{var}'
        FILENAME = COLLECTION + '_{var}_{date}'
    else:
        PARENT_FOLDER = COLLECTION + '_{days}day_avg'.format(
            days=DAYS_TO_AVERAGE)
        EE_COLLECTION_GEN = COLLECTION + '_%sday_avg/{var}' % DAYS_TO_AVERAGE
        FILENAME = COLLECTION + '_{days}day_avg_{var}_{date}'
    # Process each variable in turn; VARS and BANDS are parallel lists.
    for i in range(len(VARS)):
        VAR = VARS[i]
        logging.info('STARTING {var}'.format(var=VAR))
        BAND = BANDS[i]
        EE_COLLECTION = EE_COLLECTION_GEN.format(var=VAR)
        # Clear collection in GEE if desired
        if CLEAR_COLLECTION_FIRST:
            if eeUtil.exists(EE_COLLECTION):
                eeUtil.removeAsset(EE_COLLECTION, recursive=True)
        # 1. Check if collection exists and create
        existing_assets = checkCreateCollection(
            EE_COLLECTION)  #make image collection if doesn't have one
        existing_dates = [getDate(a) for a in existing_assets]
        # 2. Fetch, process, stage, ingest, clean
        new_assets = processNewData(existing_dates)
        new_dates = [getDate(a) for a in new_assets]
        # 3. Delete old assets
        existing_dates = existing_dates + new_dates
        logging.info('Existing assets: {}, new: {}, max: {}'.format(
            len(existing_dates), len(new_dates), MAX_ASSETS))
        deleteExcessAssets(existing_dates, MAX_ASSETS)
        # Get most recent update date and push it to Resource Watch
        most_recent_date = get_most_recent_date(EE_COLLECTION)
        lastUpdateDate(DATASET_IDS[VAR], most_recent_date)
        logging.info('SUCCESS for {var}'.format(var=VAR))
def checkCreateCollection(VARS):
    '''
    List assets in collection if it exists, else create new collection
    INPUT   VARS: list of variables (as named in netcdf) to check collections for (list of strings)
    RETURN  existing_dates_all_vars: dates (DATE_FORMAT strings) present for ALL variable
                collections in GEE (list of strings)
            existing_dates_by_var: dates (DATE_FORMAT strings) present for EACH individual
                variable collection in GEE (list of lists of strings, in VARS order)
    '''
    # dates for which at least one variable already has data
    existing_dates = []
    # one date list per variable; lets us detect a run that crashed before
    # finishing the upload for every variable
    existing_dates_by_var = []
    for var in VARS:
        # All variables come from the same source file, so a date that is
        # present for one variable should be present for all of them.
        collection = getCollectionName(var)
        # the parent folder holds one collection per variable; create it if missing
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)
        if eeUtil.exists(collection):
            # record the dates of the assets already in this variable's collection
            assets = eeUtil.ls(collection)
            dates = [getDate_GEE(asset) for asset in assets]
            existing_dates_by_var.append(dates)
            # fold into the master date list, skipping duplicates
            for date in dates:
                if date not in existing_dates:
                    existing_dates.append(date)
        else:
            # no collection for this variable yet: empty date list, then create it
            existing_dates_by_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
    # Verify every variable finished uploading on the last run: a fully
    # uploaded date appears len(TIME_HOURS) times per variable, so any date
    # with a smaller per-variable count must be re-uploaded.
    existing_dates_all_vars = copy.copy(existing_dates)
    for date in existing_dates:
        appearances = sum(var_dates.count(date) for var_dates in existing_dates_by_var)
        # normalize by the number of time intervals per date
        count = appearances / len(TIME_HOURS)
        if count < len(VARS):
            # at least one variable is missing this date; drop it so it is redone
            existing_dates_all_vars.remove(date)
    return existing_dates_all_vars, existing_dates_by_var
def main():
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    ### 1. Initialize eeUtil
    eeUtil.initJson()
    ### 2. Create collection names, clear if desired
    # four collections: {arctic, antarctic} x {original projection, reprojected}
    arctic_collection_orig = EE_COLLECTION.format(arctic_or_antarctic='arctic',
                                                  orig_or_reproj='orig')
    arctic_collection_reproj = EE_COLLECTION.format(
        arctic_or_antarctic='arctic', orig_or_reproj='reproj')
    antarctic_collection_orig = EE_COLLECTION.format(
        arctic_or_antarctic='antarctic', orig_or_reproj='orig')
    antarctic_collection_reproj = EE_COLLECTION.format(
        arctic_or_antarctic='antarctic', orig_or_reproj='reproj')
    collections = [
        arctic_collection_orig, arctic_collection_reproj,
        antarctic_collection_orig, antarctic_collection_reproj
    ]
    if CLEAR_COLLECTION_FIRST:
        for collection in collections:
            if eeUtil.exists(collection):
                eeUtil.removeAsset(collection, recursive=True)
    ### 3. Process arctic data
    arctic_data = collections[0:2]
    arctic_assets_orig = checkCreateCollection(arctic_data[0])
    arctic_assets_reproj = checkCreateCollection(arctic_data[1])
    arctic_dates_orig = [getRasterDate(a) for a in arctic_assets_orig]
    arctic_dates_reproj = [getRasterDate(a) for a in arctic_assets_reproj]
    # current data keys off the reprojected collection's dates
    new_arctic_assets_orig, new_arctic_assets_reproj = processNewRasterData(
        arctic_dates_reproj, 'arctic', new_or_hist='new')
    new_arctic_dates_orig = [getRasterDate(a) for a in new_arctic_assets_orig]
    new_arctic_dates_reproj = [
        getRasterDate(a) for a in new_arctic_assets_reproj
    ]
    ### 4. Process antarctic data
    antarctic_data = collections[2:]
    antarctic_assets_orig = checkCreateCollection(antarctic_data[0])
    antarctic_assets_reproj = checkCreateCollection(antarctic_data[1])
    antarctic_dates_orig = [getRasterDate(a) for a in antarctic_assets_orig]
    antarctic_dates_reproj = [
        getRasterDate(a) for a in antarctic_assets_reproj
    ]
    new_antarctic_assets_orig, new_antarctic_assets_reproj = processNewRasterData(
        antarctic_dates_reproj, 'antarctic', new_or_hist='new')
    new_antarctic_dates_orig = [
        getRasterDate(a) for a in new_antarctic_assets_orig
    ]
    new_antarctic_dates_reproj = [
        getRasterDate(a) for a in new_antarctic_assets_reproj
    ]
    ### 5. Delete old assets
    # e_dates/n_dates are index-aligned with `collections`:
    # even index = orig, odd = reproj; first two arctic, last two antarctic
    e_dates = [
        arctic_dates_orig, arctic_dates_reproj, antarctic_dates_orig,
        antarctic_dates_reproj
    ]
    n_dates = [
        new_arctic_dates_orig, new_arctic_dates_reproj,
        new_antarctic_dates_orig, new_antarctic_dates_reproj
    ]
    for i in range(4):
        orig_or_reproj = 'orig' if i % 2 == 0 else 'reproj'
        arctic_or_antarctic = 'arctic' if i < 2 else 'antarctic'
        e = e_dates[i]
        n = n_dates[i]
        total = e + n
        logging.info('Existing {} {} assets: {}, new: {}, max: {}'.format(
            orig_or_reproj, arctic_or_antarctic, len(e), len(n), MAX_DATES))
        deleteExcessAssets(total, orig_or_reproj, arctic_or_antarctic,
                           MAX_DATES, 'new')
    ###
    # Update last-update dates on Resource Watch and flush layer tile caches
    # only when the collection actually gained newer data.
    for dataset, id in DATASET_ID.items():
        # Get most recent update date
        most_recent_date = get_most_recent_date(dataset)
        current_date = getLastUpdate(id)
        if current_date != most_recent_date:
            logging.info('Updating last update date and flushing cache.')
            # Update data set's last update date on Resource Watch
            lastUpdateDate(id, most_recent_date)
            # get layer ids and flush tile cache for each
            layer_ids = getLayerIDs(id)
            for layer_id in layer_ids:
                flushTileCache(layer_id)
    ## Process historical data
    if COLLECT_BACK_HISTORY == True:
        for month in HISTORICAL_MONTHS:
            logging.info(
                'Processing historical data for month {}'.format(month))
            ### 2. Create collection names, clear if desired
            # historical collections are additionally keyed by zero-padded month
            arctic_collection_orig = EE_COLLECTION_BY_MONTH.format(
                arctic_or_antarctic='arctic', orig_or_reproj='orig',
                month="{:02d}".format(month))
            arctic_collection_reproj = EE_COLLECTION_BY_MONTH.format(
                arctic_or_antarctic='arctic', orig_or_reproj='reproj',
                month="{:02d}".format(month))
            antarctic_collection_orig = EE_COLLECTION_BY_MONTH.format(
                arctic_or_antarctic='antarctic', orig_or_reproj='orig',
                month="{:02d}".format(month))
            antarctic_collection_reproj = EE_COLLECTION_BY_MONTH.format(
                arctic_or_antarctic='antarctic', orig_or_reproj='reproj',
                month="{:02d}".format(month))
            collections = [
                arctic_collection_orig, arctic_collection_reproj,
                antarctic_collection_orig, antarctic_collection_reproj
            ]
            ### 3. Process arctic data
            arctic_data = collections[0:2]
            arctic_assets_orig = checkCreateCollection(arctic_data[0])
            arctic_assets_reproj = checkCreateCollection(arctic_data[1])
            arctic_dates_orig = [getRasterDate(a) for a in arctic_assets_orig]
            arctic_dates_reproj = [
                getRasterDate(a) for a in arctic_assets_reproj
            ]
            # NOTE: the historical path keys off the orig dates, unlike the
            # current-data path above which uses the reproj dates
            new_arctic_assets_orig, new_arctic_assets_reproj = processNewRasterData(
                arctic_dates_orig, 'arctic', new_or_hist='hist', month=month)
            new_arctic_dates_orig = [
                getRasterDate(a) for a in new_arctic_assets_orig
            ]
            new_arctic_dates_reproj = [
                getRasterDate(a) for a in new_arctic_assets_reproj
            ]
            ### 4. Process antarctic data
            antarctic_data = collections[2:]
            antarctic_assets_orig = checkCreateCollection(antarctic_data[0])
            antarctic_assets_reproj = checkCreateCollection(antarctic_data[1])
            antarctic_dates_orig = [
                getRasterDate(a) for a in antarctic_assets_orig
            ]
            antarctic_dates_reproj = [
                getRasterDate(a) for a in antarctic_assets_reproj
            ]
            new_antarctic_assets_orig, new_antarctic_assets_reproj = processNewRasterData(
                antarctic_dates_orig, 'antarctic', new_or_hist='hist',
                month=month)
            new_antarctic_dates_orig = [
                getRasterDate(a) for a in new_antarctic_assets_orig
            ]
            new_antarctic_dates_reproj = [
                getRasterDate(a) for a in new_antarctic_assets_reproj
            ]
            ### 5. Delete old assets
            e_dates = [
                arctic_dates_orig, arctic_dates_reproj, antarctic_dates_orig,
                antarctic_dates_reproj
            ]
            n_dates = [
                new_arctic_dates_orig, new_arctic_dates_reproj,
                new_antarctic_dates_orig, new_antarctic_dates_reproj
            ]
            for i in range(4):
                orig_or_reproj = 'orig' if i % 2 == 0 else 'reproj'
                arctic_or_antarctic = 'arctic' if i < 2 else 'antarctic'
                e = e_dates[i]
                n = n_dates[i]
                total = e + n
                logging.info('Existing {} {} assets: {}, new: {}'.format(
                    orig_or_reproj, arctic_or_antarctic, len(e), len(n)))
                #uncomment if we want to put a limit on how many years of historical data we have
                #deleteExcessAssets(total, orig_or_reproj, arctic_or_antarctic, MAX_DATES,'hist')
            ###
            for dataset, id in HIST_DATASET_ID.items():
                # Get most recent update date
                most_recent_date = get_most_recent_date(dataset)
                lastUpdateDate(id, most_recent_date)
    logging.info('SUCCESS')
# Smoke test: round-trip a test folder through eeUtil (create, verify, remove).
import eeUtil

eeUtil.initJson()
collection = 'cli_012_co2_concentrations'
test_asset = f'test_{collection}'
# should not exist yet
print(eeUtil.exists(test_asset))
eeUtil.createFolder(test_asset, True, public=True)
print('hola holita!')
# should exist now
print(eeUtil.exists(test_asset))
eeUtil.removeAsset(test_asset)
# should be gone again
print(eeUtil.exists(test_asset))
def test_collectionExists():
    '''Check that the expected GEE collection is present.'''
    # message previously said "Response metadata incorrect", which did not
    # describe this check; also dropped the redundant trailing `return`
    assert eeUtil.exists(COLLECTION), "Collection {} does not exist".format(COLLECTION)