コード例 #1
0
def checkCreateCollection(VARS):
    '''
    Make sure the parent folder and one GEE collection per variable exist, and report which
    dates are already stored
    INPUT   VARS: variables to check collections for (list of strings)
    RETURN  dates found in the collections whose occurrence count, divided by len(TIME_HOURS),
            is at least len(VARS) (list)
            dates found in each variable's collection, in the same order as VARS (list of lists)
    '''
    # dates seen in any collection, without repeats, in order of first appearance
    seen_dates = []
    # one list of dates per variable (empty list for a collection that had to be created)
    dates_per_var = []
    for var in VARS:
        collection = EE_COLLECTION_GEN.format(var=var)

        # the parent folder has to exist before a collection can be placed inside it
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)

        # a missing collection has no dates yet: record that, create it, and move on
        if not eeUtil.exists(collection):
            dates_per_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
            continue

        # pull the date out of every asset already in this variable's collection
        var_dates = [getDate_GEE(asset) for asset in eeUtil.ls(collection)]
        dates_per_var.append(var_dates)
        for var_date in var_dates:
            if var_date not in seen_dates:
                seen_dates.append(var_date)

    def _uploads_per_time_step(date):
        # total appearances of this date across all variables, scaled by the number of
        # entries in TIME_HOURS (presumably one asset per time step -- confirm)
        return sum(var_dates.count(date) for var_dates in dates_per_var) / len(TIME_HOURS)

    # keep only the dates that are not short of one full set per variable
    complete_dates = [
        date for date in seen_dates
        if not _uploads_per_time_step(date) < len(VARS)
    ]
    return complete_dates, dates_per_var
コード例 #2
0
 def checkCreateCollection(collection):
     '''
     List the assets in a collection, creating the collection first if it is missing
     INPUT   collection: collection to check or create (string)
     RETURN  result of eeUtil.ls for an existing collection, otherwise an empty list
     '''
     # nothing to list yet: create the collection and report it as empty
     if not eeUtil.exists(collection):
         logging.info('{} does not exist, creating'.format(collection))
         eeUtil.createFolder(collection, True, public=True)
         return []
     return eeUtil.ls(collection)
コード例 #3
0
def get_forecast_run_date(var):
    '''
    Get the date that the most recent forecast was run from
    INPUT   var: variable for which we are pulling forecast run date (string)
    RETURN  date parsed from the first asset name, in sorted order, of the variable's collection (datetime)
    '''
    # list the assets currently stored in this variable's collection
    asset_names = eeUtil.ls(getCollectionName(var))
    # sorting the names puts the earliest one first
    # (assumes asset names sort chronologically -- confirm against the naming scheme)
    assets_oldest_first = sorted(asset_names)
    # the last 13 characters of the asset name are parsed with DATE_FORMAT
    # (assumes a DATE_FORMAT timestamp is exactly 13 characters long -- confirm)
    run_date_text = assets_oldest_first[0][-13:]
    return datetime.datetime.strptime(run_date_text, DATE_FORMAT)
コード例 #4
0
def checkCreateCollection(collection):
    '''
    Return the assets of a GEE collection, creating the collection when it is missing
    INPUT   collection: GEE collection to check or create (string)
    RETURN  list of assets in collection (list of strings)
    '''
    if eeUtil.exists(collection):
        # the collection is already there, so report what it holds
        assets = eeUtil.ls(collection)
    else:
        # a collection that was just created cannot contain any assets yet
        logging.info('{} does not exist, creating'.format(collection))
        eeUtil.createFolder(collection, True, public=True)
        assets = []
    return assets
コード例 #5
0
def checkCreateCollection(VARS):
    '''
    Make sure the parent folder and one GEE collection per variable exist, and report which
    dates have already been uploaded
    INPUT   VARS: list variables (as named in netcdf) that we want to check collections for (list of strings)
    RETURN  existing_dates_all_vars: list of dates, in the format of the DATE_FORMAT variable, that exist for all variable collections in GEE (list of strings)
            existing_dates_by_var: list of dates, in the format of the DATE_FORMAT variable, that exist for each individual variable collection in GEE (list containing list of strings for each variable)
    '''
    # every date found in any variable's collection, without repeats
    seen_dates = []
    # the dates found for each variable, in the same order as VARS; lets the caller spot a
    # previous run that crashed before every variable finished uploading
    dates_per_var = []
    for var in VARS:
        # each variable has its own collection, stored inside the dataset's parent folder
        collection = getCollectionName(var)

        # the parent folder has to exist before a collection can be placed inside it
        if not eeUtil.exists(PARENT_FOLDER):
            logging.info('{} does not exist, creating'.format(PARENT_FOLDER))
            eeUtil.createFolder(PARENT_FOLDER)

        # a missing collection has no dates yet: record that, create it, and move on
        if not eeUtil.exists(collection):
            dates_per_var.append([])
            logging.info('{} does not exist, creating'.format(collection))
            eeUtil.createFolder(collection, True)
            continue

        # pull the date out of every asset already in this variable's collection
        var_dates = [getDate_GEE(asset) for asset in eeUtil.ls(collection)]
        dates_per_var.append(var_dates)
        # add any date we have not come across yet to the combined list
        for var_date in var_dates:
            if var_date not in seen_dates:
                seen_dates.append(var_date)

    def _uploads_per_time_step(date):
        # a date is repeated once for each time interval, so scale the total number of
        # appearances across all variables by the number of time intervals
        appearances = sum(var_dates.count(date) for var_dates in dates_per_var)
        return appearances / len(TIME_HOURS)

    # A date whose scaled count is below the number of variables was not fully uploaded for
    # at least one variable on an earlier run. Leave it out so that it gets uploaded again.
    complete_dates = [
        date for date in seen_dates
        if not _uploads_per_time_step(date) < len(VARS)
    ]
    return complete_dates, dates_per_var
コード例 #6
0
def main():
    '''
    Ingest new data into EE and delete old data

    Sets the module-level names VAR, EE_COLLECTION, EE_COLLECTION_GEN, PARENT_FOLDER,
    FILENAME and GS_FOLDER as it runs.
    '''
    global VAR, EE_COLLECTION, EE_COLLECTION_GEN
    global PARENT_FOLDER, FILENAME, GS_FOLDER

    # derive the folder name and the naming templates from COLLECTION
    PARENT_FOLDER = COLLECTION
    EE_COLLECTION_GEN = COLLECTION + '/{var}'
    # drops the first 29 characters of COLLECTION (presumably a fixed path prefix -- confirm)
    FILENAME = COLLECTION[29:] + '_{var}_{date}'

    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # Initialize eeUtil and clear collection in GEE if desired
    eeUtil.initJson()
    initialize_ee()
    if CLEAR_COLLECTION_FIRST:
        clearCollection()

    # 1. Make sure the collections exist and find out which dates they already hold
    existing_dates, existing_dates_by_var = checkCreateCollection(VARS)

    # work out which dates still need to be fetched
    all_new_dates = getNewDates(existing_dates)
    # new data means a newer forecast is available; clear the collection so that
    # only the most recent forecast is stored
    if all_new_dates:
        clearCollection()

    # the container is only big enough to hold 3 files at once, so process in groups
    batch_size = 3
    date_batches = [
        all_new_dates[start:start + batch_size]
        for start in range(0, len(all_new_dates), batch_size)
    ]
    for date_batch in date_batches:
        logging.info('Fetching files for {}'.format(date_batch))
        # locations of the downloaded netcdfs in the docker container
        files = fetch(date_batch)
        # the last file only has one time output, so it is processed differently
        last_file = files[-1]

        for var_num, var_name in enumerate(VARS):
            # publish the current variable and its GEE / Google Cloud Storage names as globals
            VAR = var_name
            EE_COLLECTION = EE_COLLECTION_GEN.format(var=VAR)
            GS_FOLDER = COLLECTION[1:] + '_' + VAR
            existing_assets = eeUtil.ls(EE_COLLECTION)

            # 2. Fetch, process, stage, ingest, clean
            new_assets = processNewData(files, var_num, last_file)
            ingested_dates = [getDateTime(asset) for asset in new_assets]

            # 3. Delete old assets
            all_dates = existing_dates_by_var[var_num] + ingested_dates
            new_asset_names = [os.path.split(asset)[1] for asset in new_assets]
            unique_assets = np.unique(existing_assets + new_asset_names)
            all_assets = np.sort(unique_assets)
            logging.info('Existing assets for {}: {}, new: {}, max: {}'.format(
                VAR, len(all_dates), len(ingested_dates), MAX_ASSETS))
            deleteExcessAssets(all_assets, MAX_ASSETS)
            logging.info('SUCCESS for {}'.format(VAR))

            # record the most recent update date once the final variable has been handled
            if var_num == len(VARS) - 1:
                most_recent_date = get_most_recent_date(all_assets)
                lastUpdateDate(DATASET_ID, most_recent_date)

        # Delete local netcdf files
        if DELETE_LOCAL:
            logging.info('Cleaning local NETCDF files')
            for local_file in files:
                os.remove(local_file)
コード例 #7
0
def test_deleteCollection():
    '''Check that listing COLLECTION returns at least one asset'''
    asset_count = len(eeUtil.ls(COLLECTION))
    assert asset_count > 0, "Response metadata incorrect"
コード例 #8
0
def test_createCollection():
    '''Create the test_-prefixed collection and check that listing it returns no assets'''
    test_collection = f'test_{COLLECTION}'
    eeUtil.createFolder(test_collection, True, public=True)

    listed_assets = eeUtil.ls(test_collection)
    assert len(listed_assets) == 0, "Response metadata incorrect"