def checkCreateCollection(VARS):
    """Return dates fully uploaded for every variable, plus per-variable date lists.

    Also ensures the parent GEE folder and one collection per variable exist,
    creating any that are missing.
    """
    # dates present for at least one variable (deduplicated, insertion order)
    all_dates = []
    # one list of asset dates per variable, in VARS order
    dates_by_var = []
    for variable in VARS:
        # each variable gets its own collection under the shared parent folder
        collection = EE_COLLECTION_GEN.format(var=variable)
        # make sure the parent folder exists before touching its collections
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)
        if not eeUtil.exists(collection):
            # nothing uploaded yet for this variable: record no dates, create it
            dates_by_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
        else:
            # gather the date of every asset already in this collection
            asset_dates = [getDate_GEE(asset) for asset in eeUtil.ls(collection)]
            dates_by_var.append(asset_dates)
            for d in asset_dates:
                if d not in all_dates:
                    all_dates.append(d)
    # Keep only dates for which every variable uploaded every time step;
    # anything short of that means a previous run crashed mid-upload.
    complete_dates = copy.copy(all_dates)
    for d in all_dates:
        appearances = sum(var_dates.count(d) for var_dates in dates_by_var) / len(TIME_HOURS)
        if appearances < len(VARS):
            complete_dates.remove(d)
    return complete_dates, dates_by_var
def checkCreateCollection(collection):
    """Return the assets in `collection`, creating it (empty, public) if absent."""
    if not eeUtil.exists(collection):
        # brand-new collection necessarily holds no assets yet
        logging.info('{} does not exist, creating'.format(collection))
        eeUtil.createFolder(collection, True, public=True)
        return []
    return eeUtil.ls(collection)
def get_forecast_run_date(var):
    '''
    Get the date that the most recent forecast was run
    INPUT   var: variable for which we are pulling forecast run date (string)
    RETURN  most_recent_forecast_date: date of most recent forecast run (datetime)
    '''
    # everything currently stored in this variable's collection
    assets = eeUtil.ls(getCollectionName(var))
    # the lexicographically smallest asset name is the oldest time step, which
    # carries the forecast run date; its last 13 characters hold the date string
    earliest_asset = sorted(assets)[0]
    return datetime.datetime.strptime(earliest_asset[-13:], DATE_FORMAT)
def checkCreateCollection(collection):
    '''
    List assets in collection if it exists, else create new collection
    INPUT   collection: GEE collection to check or create (string)
    RETURN  list of assets in collection (list of strings)
    '''
    # missing collection: create it and report no assets, since a freshly
    # created collection cannot contain anything
    if not eeUtil.exists(collection):
        logging.info('{} does not exist, creating'.format(collection))
        eeUtil.createFolder(collection, True, public=True)
        return []
    # existing collection: hand back whatever assets it holds
    return eeUtil.ls(collection)
def checkCreateCollection(VARS):
    '''
    List assets in each variable's collection if it exists, else create the collection
    INPUT   VARS: list of variables (as named in the netcdf) to check collections for (list of strings)
    RETURN  existing_dates_all_vars: dates, formatted per DATE_FORMAT, that exist for ALL
                variable collections in GEE (list of strings)
            existing_dates_by_var: dates, formatted per DATE_FORMAT, that exist for EACH
                individual variable collection in GEE (list of lists of strings)
    '''
    # dates already uploaded for at least one variable (deduplicated)
    existing_dates = []
    # per-variable date lists; used to detect a previous run that crashed
    # before finishing the upload for every variable
    existing_dates_by_var = []
    for var in VARS:
        # all variables come from the same source file, so if one variable has
        # data for a date, the others should too
        collection = getCollectionName(var)
        # the parent folder holds one collection per variable; create it first if needed
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)
        if eeUtil.exists(collection):
            # record the date of every asset already uploaded for this variable
            dates = [getDate_GEE(a) for a in eeUtil.ls(collection)]
            existing_dates_by_var.append(dates)
            # fold any new dates into the master list
            for date in dates:
                if date not in existing_dates:
                    existing_dates.append(date)
        else:
            # nothing uploaded yet for this variable; create its collection
            existing_dates_by_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
    # A date only counts as complete when every variable uploaded every time
    # step for it; incomplete dates must be re-uploaded on this run.
    existing_dates_all_vars = copy.copy(existing_dates)
    for date in existing_dates:
        # total appearances across all variables, normalized by the number of
        # time intervals (the date repeats once per time step)
        count = sum(x.count(date) for x in existing_dates_by_var) / len(TIME_HOURS)
        if count < len(VARS):
            # at least one variable is missing data for this date
            existing_dates_all_vars.remove(date)
    return existing_dates_all_vars, existing_dates_by_var
def main():
    # These module-level names are mutated here and read by the processing
    # helpers (processNewData, fetch, etc.) defined elsewhere in the file.
    global VAR
    global EE_COLLECTION
    global EE_COLLECTION_GEN
    global PARENT_FOLDER
    global FILENAME
    global GS_FOLDER
    # derive the GEE folder, per-variable collection template, and filename
    # template from the dataset's COLLECTION path
    # NOTE(review): the [29:] slice assumes a fixed-length path prefix — confirm
    # against the COLLECTION constant defined at module level
    PARENT_FOLDER = COLLECTION
    EE_COLLECTION_GEN = COLLECTION + '/{var}'
    FILENAME = COLLECTION[29:] + '_{var}_{date}'
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    # Initialize eeUtil and clear collection in GEE if desired
    eeUtil.initJson()
    initialize_ee()
    if CLEAR_COLLECTION_FIRST:
        clearCollection()
    # 1. Check if collection exists and create it; returns the dates already
    # uploaded for all variables and the dates present per variable
    existing_dates, existing_dates_by_var = checkCreateCollection(VARS)
    # Determine which files to fetch
    all_new_dates = getNewDates(existing_dates)
    # if new data is available, clear the collection because we want to store the most
    # recent forecast, not the old forecast
    if all_new_dates:
        clearCollection()
    # container only big enough to hold 3 files at once, so break into groups to process
    new_date_groups = [all_new_dates[x:x + 3] for x in range(0, len(all_new_dates), 3)]
    for new_dates in new_date_groups:
        # Fetch new files
        logging.info('Fetching files for {}'.format(new_dates))
        files = fetch(new_dates)  # get list of locations of netcdfs in docker container
        # get last date because this file only has one time output so we need to process it differently
        last_file = files[-1]
        for var_num in range(len(VARS)):
            # get variable name
            VAR = VARS[var_num]
            # specify GEE collection name and Google Cloud Storage folder names
            EE_COLLECTION = EE_COLLECTION_GEN.format(var=VAR)
            GS_FOLDER = COLLECTION[1:] + '_' + VAR
            existing_assets = eeUtil.ls(EE_COLLECTION)
            # 2. Fetch, process, stage, ingest, clean
            new_assets = processNewData(files, var_num, last_file)
            # NOTE(review): this rebinding shadows the `new_dates` group list
            # from the outer loop — looks unintended; verify downstream use
            new_dates = [getDateTime(a) for a in new_assets]
            # 3. Delete old assets
            all_dates = existing_dates_by_var[var_num] + new_dates
            all_assets = np.sort(np.unique(existing_assets + [os.path.split(asset)[1] for asset in new_assets]))
            logging.info('Existing assets for {}: {}, new: {}, max: {}'.format(VAR, len(all_dates), len(new_dates), MAX_ASSETS))
            deleteExcessAssets(all_assets, (MAX_ASSETS))
            logging.info('SUCCESS for {}'.format(VAR))
            # after the final variable, record the dataset's last-update date
            if var_num == len(VARS) - 1:
                # Get most recent update date
                most_recent_date = get_most_recent_date(all_assets)
                lastUpdateDate(DATASET_ID, most_recent_date)
        # Delete local netcdf files once this group of dates has been processed
        # (presumably per-group, to respect the 3-file container limit — the
        # collapsed source does not show the original indentation; confirm)
        if DELETE_LOCAL:
            logging.info('Cleaning local NETCDF files')
            for f in files:
                os.remove(f)
def test_deleteCollection():
    """Check that the collection still lists at least one asset."""
    listing = eeUtil.ls(COLLECTION)
    assert len(listing) > 0, "Response metadata incorrect"
def test_createCollection():
    """Check that a freshly created collection starts out empty."""
    test_collection = f'test_{COLLECTION}'
    eeUtil.createFolder(test_collection, True, public=True)
    contents = eeUtil.ls(test_collection)
    assert len(contents) == 0, "Response metadata incorrect"