def process_items(self, p_walk: list, df_collections: dict, df_tiles: dict) -> None:
    '''Worker task that iterates over the `p_walk` list and processes its items.'''
    print_line()

    logger.info(f'process_items - `{len(p_walk)}` chunks have been received.')
    logger.info(f'process_items - p_walk first record: {p_walk[0]}')
    logger.info(f'process_items - p_walk last record: {p_walk[-1]}')

    # convert from dict to dataframe again
    df_collections = DataFrame.from_dict(df_collections)
    df_tiles = DataFrame.from_dict(df_tiles)

    # fill pandas null values (None, NaN, etc.) with numpy NaN
    df_collections.fillna({'grid_ref_sys_id': NaN}, inplace=True)

    logger.info('process_items - df_collections:\n'
                f"{df_collections[['id', 'name', 'grid_ref_sys_id', 'metadata', 'is_public']]}\n")
    logger.info(f'process_items - df_tiles.head():\n{df_tiles.head()}\n')

    items_insert = []
    errors_insert = []

    for dir_path, metadata, assets in p_walk:
        # create INSERT clauses based on the item information
        __items_insert, __errors_insert = create_item_and_get_insert_clauses(
            dir_path, metadata, assets, df_collections, df_tiles
        )

        items_insert += __items_insert
        errors_insert += __errors_insert

    # if there are item INSERT clauses, then insert them in the database
    if items_insert:
        # logger.info(f'process_items - items_insert: {items_insert}\n')
        logger.info(f'process_items - there are `{len(items_insert)}` '
                    'items to insert in the database.')

        # create a database instance and run all item INSERT clauses
        # in a single transaction
        db = DBFactory.factory()
        concatenated_inserts = ' '.join(items_insert)
        # logger.info(f'concatenated_inserts: \n{concatenated_inserts}\n')

        logger.info('process_items - inserting items in the database...')
        db.execute(concatenated_inserts, is_transaction=True)

    # if there are warning/error INSERT clauses, then insert them in the database
    if errors_insert:
        # logger.info(f'process_items - errors_insert: {errors_insert}\n')
        logger.info(f'process_items - there are `{len(errors_insert)}` '
                    'warnings or errors to insert in the database.')

        # create a database instance and run all error INSERT clauses
        # in a single transaction
        db = PostgreSQLPublisherConnection()
        concatenated_errors = ' '.join(errors_insert)
        # logger.info(f'concatenated_errors: \n{concatenated_errors}\n')

        logger.info('process_items - inserting task errors in the database...')
        db.execute(concatenated_errors, is_transaction=True)
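# --- Illustrative sketch (assumption, not part of the original module) ---
# `process_items` receives `df_collections` and `df_tiles` as plain dicts,
# presumably because worker payloads must be serializable, and rebuilds the
# dataframes on arrival. A minimal round trip of that assumed flow, with
# example column names taken from the logging above:
#
#     from numpy import NaN
#     from pandas import DataFrame
#
#     df = DataFrame({'id': [1, 2], 'grid_ref_sys_id': [10, None]})
#     payload = df.to_dict()                   # serializable form sent to the worker
#     restored = DataFrame.from_dict(payload)  # rebuilt inside the worker
#     restored.fillna({'grid_ref_sys_id': NaN}, inplace=True)
#     assert restored['id'].tolist() == [1, 2]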
def save_the_errors_in_the_database(self):
    '''Insert the errors gathered during the walk into the database.'''
    # if there are error INSERT clauses, then insert them in the database
    if self.errors_insert:
        # create a database instance and run all error INSERT clauses
        # in a single transaction
        db = PostgreSQLPublisherConnection()
        concatenated_errors = ' '.join(self.errors_insert)
        # logger.info(f'concatenated_errors: \n{concatenated_errors}\n')

        logger.info('Inserting PublisherWalk.errors into database...')
        db.execute(concatenated_errors, is_transaction=True)
def __generator(self):
    '''Generator that yields just directories with valid files.'''
    # logger.info('PublisherWalk\n')

    # `base_path` example: /TIFF/CBERS2B/
    base_path = f'{self.BASE_DIR}/{self.query["satellite"]}'
    # logger.info(f'PublisherWalk - self.query: {self.query}')

    for dir_path, dirs, files in walk(base_path, followlinks=True):
        # get dir path starting at `/TIFF`
        index = dir_path.find('TIFF')

        # `splitted_dir_path` example:
        # ['TIFF', 'CBERS4A', '2020_11', 'CBERS_4A_WFI_RAW_2020_11_10.13_41_00_ETC2',
        #  '207_148_0', '2_BC_UTM_WGS84']
        splitted_dir_path = dir_path[index:].split(os_path_sep)
        dir_level = len(splitted_dir_path)

        # keep just the valid dirs and replace the old ones with them;
        # assigning to `dirs[:]` prunes the walk in-place
        dirs[:] = self.__filter_dir(dir_level, dir_path, dirs)

        # if I'm not inside a geo processing dir, then ignore this folder
        if dir_level != 6:
            continue

        # if the dir does not have any file, then report and ignore this folder
        if not files:
            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': 'This folder is valid, but it is empty.',
                    'metadata': {'folder': dir_path},
                    'type': 'warning'
                })
            )
            continue

        # if there is not enough metadata, then ignore this folder
        metadata = decode_path(dir_path)
        if not metadata:
            continue

        assets = {}
        for radio_processing in self.query['radio_processing']:
            # if the user is publishing `SR` files, but there are no
            # `SR` files in this folder, then ignore it
            if radio_processing == 'SR' and not is_there_sr_files_in_the_list_of_files(files):
                continue

            assets_metadata = self.satellite_metadata.get_assets_metadata(
                metadata['satellite'], metadata['sensor'], radio_processing
            )

            # if there is not a valid asset, then ignore it
            __assets = self.__create_assets_from_metadata(assets_metadata, dir_path, metadata)
            if not __assets:
                continue

            assets[radio_processing] = __assets

        # if there is not at least one asset, then ignore this folder
        if not assets:
            continue

        # yield just valid directories
        yield dir_path, metadata, assets
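# --- Illustrative sketch (assumption, not part of the original module) ---
# `__generator` prunes the `os.walk` traversal by assigning to `dirs[:]`:
# `walk` reuses that same list to decide where to descend next, so replacing
# its contents in-place skips the discarded subtrees entirely. A minimal,
# self-contained example of the technique:
#
#     from os import walk
#
#     for dir_path, dirs, files in walk('/TIFF', followlinks=True):
#         # descend only into directories whose names start with 'CBERS'
#         dirs[:] = [d for d in dirs if d.startswith('CBERS')]
#
# Note that the slice assignment `dirs[:] = ...` is required; a plain
# `dirs = ...` would rebind the local name and leave the traversal unchanged.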
def __filter_dir(self, dir_level, dir_path, dirs):
    '''Filter the `dirs` parameter based on the directory level.'''

    # check the year_month dirs
    if dir_level == 2:
        # I'm inside a satellite folder, then the dirs are year-month folders.
        # return just the year_month dirs that are inside the date range;
        # `start_date` and `end_date` fields are required

        # example: 2019_01
        start_year_month = (f"{self.query['start_date'].year}_"
                            f"{fill_string_with_left_zeros(str(self.query['start_date'].month), 2)}")
        # example: 2020_12
        end_year_month = (f"{self.query['end_date'].year}_"
                          f"{fill_string_with_left_zeros(str(self.query['end_date'].month), 2)}")

        return [d for d in dirs if d >= start_year_month and d <= end_year_month]

    # check the scene dirs
    elif dir_level == 3:
        # I'm inside a year-month folder, then the dirs are scene folders.
        # return just the scene dirs that have the selected sensor.
        # if the option is None, then return the original dirs
        if self.query['sensor'] is None:
            return dirs

        def check_scene_dir(scene_dir):
            try:
                _, sensor_dir, date_dir, time_dir = decode_scene_dir(scene_dir)
            except CDSRDecoderException as error:
                self.errors_insert.append(
                    PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': error,
                        'metadata': {'folder': dir_path, 'method': 'check_scene_dir'},
                        'type': 'error'
                    })
                )
                return None

            # if scene_dir does not have the selected sensor, then do not return it
            if sensor_dir != self.query['sensor']:
                return None

            # convert date from str to datetime
            date = datetime.strptime(date_dir, '%Y-%m-%d')

            # if the time dir is between 0h and 5h, then consider it one day earlier,
            # because the date is the reception date and not the viewing date
            if time_dir >= '00:00:00' and time_dir <= '05:00:00':
                # subtract one day from the date
                date -= timedelta(days=1)

            # if scene_dir is not inside the selected date range, then do not return it
            if not (date >= self.query['start_date'] and date <= self.query['end_date']):
                return None

            return scene_dir

        return list(filter(check_scene_dir, dirs))

    # check the path/row dirs
    elif dir_level == 4:
        # I'm inside a sensor folder, then the dirs are path/row folders

        def check_path_row_dir(path_row_dir):
            try:
                path, row = decode_path_row_dir(path_row_dir)
            except CDSRDecoderException as error:
                self.errors_insert.append(
                    PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': error,
                        'metadata': {'folder': dir_path, 'method': 'check_path_row_dir'},
                        'type': 'error'
                    })
                )
                return None

            if self.query['path'] is not None and self.query['path'] != int(path):
                return None

            if self.query['row'] is not None and self.query['row'] != int(row):
                return None

            return path_row_dir

        return list(filter(check_path_row_dir, dirs))

    # check the geo processing dirs
    elif dir_level == 5:
        # I'm inside a path/row folder, then the dirs are geo processing folders

        # lambda function to check if the directory starts with any selected geo processing
        check_if_dir_startswith_any_gp = lambda directory: any(
            directory.startswith(gp) for gp in self.query['geo_processing']
        )

        # if the dir does not start with any selected geo_processing, then it is invalid.
        # `d` example: `2_BC_UTM_WGS84`
        return [d for d in dirs if check_if_dir_startswith_any_gp(d)]

    # check files existence
    elif dir_level == 6:
        # I'm inside a geo processing folder, then there should not be dirs inside here
        if dirs:
            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': 'There are folders inside a geo processing directory.',
                    'metadata': {'folder': dir_path},
                    'type': 'warning'
                })
            )
        return dirs

    # any other directory level is unexpected; report it and keep the dirs
    self.errors_insert.append(
        PostgreSQLPublisherConnection.create_task_error_insert_clause({
            'message': f'Invalid `{dir_level}` directory level.',
            'metadata': {'folder': dir_path},
            'type': 'warning'
        })
    )
    return dirs
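# --- Illustrative sketch (assumption, not part of the original module) ---
# The `dir_level == 2` branch compares `YYYY_MM` folder names as strings.
# This works because both parts are zero-padded to a fixed width, so
# lexicographic order matches chronological order:
#
#     dirs = ['2018_12', '2019_01', '2019_11', '2020_12', '2021_01']
#     start_year_month, end_year_month = '2019_01', '2020_12'
#     kept = [d for d in dirs if start_year_month <= d <= end_year_month]
#     assert kept == ['2019_01', '2019_11', '2020_12']
#
# Without the left zero-padding (e.g. `2019_2`), '2019_2' would compare
# greater than '2019_11', breaking the range check; that is why
# `fill_string_with_left_zeros` is applied to the month.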
def __create_assets_from_metadata(self, assets_metadata, dir_path, metadata):
    '''Create an assets object based on the assets metadata.'''

    # search for all files that end with `*.png`
    png_files = glob(f'{dir_path}/*.png')
    if not png_files:
        self.errors_insert.append(
            PostgreSQLPublisherConnection.create_task_error_insert_clause({
                'message': 'There is NOT a quicklook in this folder, then it will be ignored.',
                'metadata': {'folder': dir_path},
                'type': 'error'
            })
        )
        return None

    # initialize the `assets` object with the `thumbnail` key
    assets = {
        'thumbnail': {
            'href': png_files[0],
            'type': 'image/png',
            'roles': ['thumbnail']
        }
    }

    # if this folder is WFI/L4, then it must contain `*h5_*.json` files
    if metadata['geo_processing'] == '4' and \
            (metadata['sensor'] == 'WFI' or metadata['sensor'] == 'AWFI'):
        # search for all files that end with `*h5_*.json`
        l4_json_files = sorted(glob(f'{dir_path}/*.h5_*.json'))
        if not l4_json_files:
            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': 'There is NOT a L4 JSON file (i.e. `*h5_*.json`) in this folder, '
                               'then it will be ignored.',
                    'metadata': {'folder': dir_path},
                    'type': 'error'
                })
            )
            return None

        # if there are L4 JSON files, then add them to the assets dict
        for l4_json_file in l4_json_files:
            # l4_json_file example:
            # '/TIFF/CBERS4A/2020_11/.../4_BC_UTM_WGS84/CBERS_4A_WFI_20201122_217_156.h5_0.json'
            # first get the file name, then get the `h5_N` part from the file name.
            # the asset name should be something like `h5_N_json`
            # (e.g. either h5_0_json or h5_1_json)
            asset_name = f"{l4_json_file.split('/')[-1].split('.')[1]}_json"

            assets[asset_name] = {
                'href': l4_json_file,
                'type': 'application/json',
                'roles': ['metadata']
            }

    for band, band_template in assets_metadata.items():
        # search for all TIFF files based on a template with `band_template`.
        # for example: search all TIFF files that match '/folder/*BAND6.tif'
        tiff_files = sorted(glob(f'{dir_path}/*{band_template}'))

        if not tiff_files:
            # EVI and NDVI files are optional, so if they do not exist, do not report them
            if band == 'evi' or band == 'ndvi':
                continue

            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': ('There is NOT a TIFF file in this folder that ends with the '
                                f'`{band_template}` template, then it will be ignored.'),
                    'metadata': {'folder': dir_path},
                    'type': 'error'
                })
            )
            return None

        # get just the band name from the template (e.g. `BAND6`)
        band_name = band_template.replace('.tif', '')

        # add the TIFF file as an asset
        assets[band_name] = {
            'href': tiff_files[0],
            'type': 'image/tiff; application=geotiff',
            'common_name': band,
            'roles': ['data']
        }

        # quality, EVI and NDVI TIFF files do not have XML files
        if band == 'quality' or band == 'evi' or band == 'ndvi':
            # the `quality` band contains a JSON file
            if band == 'quality':
                # search for all JSON files based on a template with `band_template`.
                # for example: search all JSON files that match '/folder/*BAND6.json'
                json_files = sorted(glob(f"{dir_path}/*{band_template.replace('.tif', '.json')}"))

                if not json_files:
                    self.errors_insert.append(
                        PostgreSQLPublisherConnection.create_task_error_insert_clause({
                            'message': ('There is NOT a JSON file in this folder that ends with the '
                                        f"`{band_template.replace('.tif', '.json')}` template, "
                                        'then it will be ignored.'),
                            'metadata': {'folder': dir_path},
                            'type': 'error'
                        })
                    )
                    return None

                # add the JSON file as an asset
                assets[band_name + '_json'] = {
                    'href': json_files[0],
                    'type': 'application/json',
                    'roles': ['metadata']
                }

            continue

        # search for all XML files based on a template with `band_template`.
        # for example: search all XML files that match '/folder/*BAND6.xml'
        xml_files = sorted(glob(f"{dir_path}/*{band_template.replace('.tif', '.xml')}"))

        if not xml_files:
            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': ('There is NOT an XML file in this folder that ends with the '
                                f"`{band_template.replace('.tif', '.xml')}` template, "
                                'then it will be ignored.'),
                    'metadata': {'folder': dir_path},
                    'type': 'error'
                })
            )
            return None

        # add the XML file as an asset
        assets[band_name + '_xml'] = {
            'href': xml_files[0],
            'type': 'application/xml',
            'roles': ['metadata']
        }

    return assets
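# --- Illustrative sketch (assumption, not part of the original module) ---
# Shape of the `assets` object that `__create_assets_from_metadata` builds;
# the band names, common name and file names below are only examples that
# follow the patterns in the code above:
#
#     assets = {
#         'thumbnail': {'href': '/TIFF/.../scene.png',
#                       'type': 'image/png', 'roles': ['thumbnail']},
#         'BAND13': {'href': '/TIFF/.../scene_BAND13.tif',
#                    'type': 'image/tiff; application=geotiff',
#                    'common_name': 'blue', 'roles': ['data']},
#         'BAND13_xml': {'href': '/TIFF/.../scene_BAND13.xml',
#                        'type': 'application/xml', 'roles': ['metadata']},
#     }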
def create_item_and_get_insert_clauses(dir_path, metadata, assets, df_collections, df_tiles):
    '''Create items from the metadata and return their INSERT clauses,
    along with any error INSERT clauses.'''
    print_line()

    items_insert = []
    errors_insert = []

    logger.info(f'dir_path: {dir_path}')
    logger.info(f'metadata: {metadata}')
    # logger.info(f'assets: {assets}')

    # `items` is a list of items (e.g. [dn_item, sr_item])
    items = create_items(metadata, assets)
    # logger.info(f'items size: {len(items)}\n')

    for item in items:
        print_line()

        # logger.info(f'item: {item}\n')
        logger.info(f"item[properties]: {item['properties']}")
        logger.info(f"item[collection]: {item['collection']}")

        # get the collection from the dataframe by its name
        collection = df_collections.loc[
            df_collections['name'] == item['collection']['name']
        ].reset_index(drop=True)
        # logger.info('collection: \n'
        #             f"{collection[['id', 'name', 'grid_ref_sys_id', 'metadata', 'is_public']]}\n")

        # if `collection` is an empty dataframe, a collection was not found by its name,
        # then save the warning and ignore it
        if len(collection.index) == 0:
            # create a substring to check if this message has already been added to the list
            sub_message = f"There is metadata to the `{item['collection']['name']}` collection"

            # check if the collection has not already been added to the errors list,
            # to prevent inserting the same message twice
            if not any(sub_message in error_insert for error_insert in errors_insert):
                errors_insert.append(
                    PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': (
                            f"There is metadata to the `{item['collection']['name']}` collection,"
                            ' however this collection does not exist in the database.'
                        ),
                        'metadata': {'folder': dir_path},
                        'type': 'error'
                    })
                )
            continue

        collection_id = collection.at[0, 'id']
        # logger.info(f'collection_id: {collection_id}')

        tile_id = get_tile_id_from_collection(collection, metadata, df_tiles)

        # create an INSERT clause based on the item metadata
        insert = PostgreSQLCatalogTestConnection.create_item_insert_clause(
            item, collection_id, tile_id
        )
        # logger.info(f'insert: {insert}\n')

        logger.info(f"Adding an INSERT clause to `{item['properties']['name']}` "
                    "item in the list...\n")
        items_insert.append(insert)

    return items_insert, errors_insert
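# --- Illustrative sketch (assumption, not part of the original module) ---
# `create_item_and_get_insert_clauses` deduplicates the "missing collection"
# error by checking whether a distinctive substring of the message already
# appears in any INSERT clause collected so far. A minimal, self-contained
# version of that guard, with hypothetical collection names:
#
#     errors_insert = []
#     for name in ['CBERS4A_WFI_L4_DN', 'CBERS4A_WFI_L4_DN']:
#         sub_message = f"There is metadata to the `{name}` collection"
#         if not any(sub_message in e for e in errors_insert):
#             errors_insert.append(f"INSERT ... '{sub_message}, however ...';")
#     assert len(errors_insert) == 1  # the duplicate message was skipped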