def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    if len(resource['files']) >= 23:
        md = download_metadata(connector, host, secret_key, resource['id'])
        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            timestamp = resource['name'].split(" - ")[1]
            out_fullday_netcdf = self.sensors.create_sensor_path(timestamp)
            out_fullday_csv = out_fullday_netcdf.replace(".nc", "_geo.csv")
            if file_exists(out_fullday_netcdf) and file_exists(out_fullday_csv):
                self.log_skip(resource, "metadata v%s and outputs already exist" % self.extractor_info['version'])
                return CheckMessage.ignore
        return CheckMessage.download
    else:
        self.log_skip(resource, "found less than 23 files")
        return CheckMessage.ignore

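# Illustrative aside (not part of any extractor): the ".split(' - ')[1]" timestamp parsing used
# throughout these check_message handlers assumes Clowder dataset names follow the
# "<sensor> - <timestamp>" convention seen elsewhere in this codebase, e.g.
# "co2Sensor - 2016-12-25" or "VNIR - 2016-12-25__12-32-42-123". The dataset name below is a
# made-up example of that convention.
example_name = "flirIrCamera - 2017-06-28__10-21-14-549"
example_sensor, example_timestamp = example_name.split(" - ")   # "flirIrCamera", "2017-06-28__10-21-14-549"
example_date = example_timestamp.split("__")[0]                 # "2017-06-28"
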
def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    self.start_check(resource)

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check for a left and right TIF file - skip if not found
    if not contains_required_files(resource, ['_left.tif', '_right.tif']):
        self.log_skip(resource, "missing required files")
        return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        if not self.force:
            # Check NRMAC score > 15 before proceeding if available
            nrmac_md = get_extractor_metadata(md, "terra.stereo-rgb.nrmac")
            if not (nrmac_md and 'left_quality_score' in nrmac_md):
                self.log_skip(resource, "NRMAC quality score not available")
                return CheckMessage.ignore
            elif float(nrmac_md['left_quality_score']) > self.threshold:
                self.log_skip(resource, "NRMAC quality score %s is above threshold of %s" %
                              (float(nrmac_md['left_quality_score']), self.threshold))
                return CheckMessage.ignore

        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            # Make sure outputs properly exist
            timestamp = resource['dataset_info']['name'].split(" - ")[1]
            left_enh_tiff = self.sensors.create_sensor_path(timestamp, opts=['left'])
            right_enh_tiff = self.sensors.create_sensor_path(timestamp, opts=['right'])
            if file_exists(left_enh_tiff) and file_exists(right_enh_tiff):
                if contains_required_files(resource, [os.path.basename(left_enh_tiff),
                                                      os.path.basename(right_enh_tiff)]):
                    self.log_skip(resource, "metadata v%s and outputs already exist" %
                                  self.extractor_info['version'])
                    return CheckMessage.ignore
                else:
                    self.log_info(resource, "output files exist but not yet uploaded")
        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download
    else:
        self.log_error(resource, "no terraref metadata found")
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check for an _ir.bin file before beginning processing
    if not contains_required_files(resource, ['_ir.bin']):
        self.log_skip(resource, "missing required files")
        return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            # Make sure outputs properly exist
            timestamp = resource['dataset_info']['name'].split(" - ")[1]
            tif = self.sensors.get_sensor_path(timestamp)
            png = tif.replace(".tif", ".png")
            if file_exists(png) and file_exists(tif):
                self.log_skip(resource, "metadata v%s and outputs already exist" % self.extractor_info['version'])
                return CheckMessage.ignore
        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download
    else:
        self.log_skip(resource, "no terraref metadata found")
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    self.start_check(resource)

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check for a left and right BIN file - skip if not found
    if not contains_required_files(resource, ['_left.bin', '_right.bin']):
        self.log_skip(resource, "missing required files")
        return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        if not self.overwrite and get_extractor_metadata(md, self.extractor_info['name'],
                                                         self.extractor_info['version']):
            # Make sure outputs properly exist
            timestamp = resource['dataset_info']['name'].split(" - ")[1]
            left_tiff = self.sensors.create_sensor_path(timestamp, opts=['left'])
            right_tiff = self.sensors.create_sensor_path(timestamp, opts=['right'])
            if file_exists(left_tiff) and file_exists(right_tiff):
                self.log_skip(resource, "metadata v%s and outputs already exist" % self.extractor_info['version'])
                return CheckMessage.ignore
        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download
    else:
        self.log_error(resource, "no terraref metadata found; sending to cleaner")
        submit_extraction(connector, host, secret_key, resource['id'], "terra.metadata.cleaner")
        return CheckMessage.ignore

def update_dataset_extractor_metadata(self, connector, host, key, dsid, metadata, extractor_name):
    """Adds or replaces existing dataset metadata for the specified extractor

    Args:
        connector(obj): the message queue connector instance
        host(str): the URI of the host making the connection
        key(str): used with the host API
        dsid(str): the dataset to update
        metadata(str): the metadata string to update the dataset with
        extractor_name(str): the name of the extractor to associate the metadata with
    """
    meta = build_metadata(host, self.extractor_info, dsid, metadata, "dataset")

    try:
        md = ds.download_metadata(connector, host, key, dsid, extractor_name)
        md_len = len(md)
    except Exception as ex:     # pylint: disable=broad-except
        md_len = 0
        logging.debug(ex.message)

    if md_len > 0:
        ds.remove_metadata(connector, host, key, dsid, extractor_name)

    ds.upload_metadata(connector, host, key, dsid, meta)

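# Minimal usage sketch for the helper above (illustrative only; the dataset id and metadata
# payload are placeholders, and connector/host/secret_key are whatever the calling extractor
# already holds). The helper builds an extractor-scoped metadata document, removes any metadata
# this extractor attached previously, then uploads the fresh copy, so a dataset never carries
# two generations of the same extractor's metadata.
#
#   new_md = {"files_created": ["..."], "status": "complete"}
#   self.update_dataset_extractor_metadata(connector, host, secret_key, dsid="<dataset-id>",
#                                          metadata=new_md,
#                                          extractor_name=self.extractor_info['name'])
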
def check_message(self, connector, host, secret_key, resource, parameters):
    # First, check if we have the correct sensor type
    md = download_metadata(connector, host, secret_key, resource['parent']['id'])
    ds_info = get_info(connector, host, secret_key, resource['parent']['id'])
    sensortype = self.determineSensorType(ds_info['name'])
    if sensortype in ["ndvi", "pri"]:
        if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
            logging.info("skipping dataset %s, already processed" % resource['id'])
            return CheckMessage.ignore

        # Check if output already exists
        timestamp = ds_info['name'].split(" - ")[1]
        out_file = self.get_sensor_path(timestamp, opts=['extracted_values'])
        if os.path.isfile(out_file) and not self.overwrite:
            logging.info("skipping %s, outputs already exist" % resource['id'])
            return CheckMessage.ignore

        return CheckMessage.download
    else:
        return CheckMessage.ignore

def process_message_individual(self, connector, host, secret_key, resource, parameters):
    """This is a deprecated method that operates on a single capture, not the field mosaic"""
    self.start_message()

    input_image = resource['local_paths'][0]

    # Create output in same directory as input, but check name
    ds_md = get_info(connector, host, secret_key, resource['parent']['id'])
    terra_md = get_terraref_metadata(download_metadata(connector, host, secret_key,
                                                       resource['parent']['id']), 'stereoTop')
    dataset_name = ds_md['name']
    timestamp = dataset_name.split(" - ")[1]

    # Is this left or right half?
    side = 'left' if resource['name'].find("_left") > -1 else 'right'
    gps_bounds = geojson_to_tuples(terra_md['spatial_metadata'][side]['bounding_box'])
    out_csv = self.sensors.create_sensor_path(timestamp, opts=[side], ext='csv')
    out_dgci = out_csv.replace(".csv", "_dgci.png")
    out_edge = out_csv.replace(".csv", "_edge.png")
    out_label = out_csv.replace(".csv", "_label.png")
    out_dgci_tif = out_dgci.replace('.png', '.tif')
    out_edge_tif = out_edge.replace('.png', '.tif')
    out_label_tif = out_label.replace('.png', '.tif')

    self.generate_all_outputs(input_image, out_csv, out_dgci, out_edge, out_label, gps_bounds)

    fileids = []
    for file_to_upload in [out_csv, out_dgci_tif, out_edge_tif, out_label_tif]:
        if os.path.isfile(file_to_upload):
            if file_to_upload not in resource['local_paths']:
                # TODO: Should this be written to a separate dataset?
                #target_dsid = build_dataset_hierarchy(connector, host, secret_key, self.clowderspace,
                #                                      self.sensors.get_display_name(),
                #                                      timestamp[:4], timestamp[5:7], timestamp[8:10],
                #                                      leaf_ds_name=dataset_name)

                # Send output to Clowder source dataset
                fileids.append(upload_to_dataset(connector, host, secret_key,
                                                 resource['parent']['id'], file_to_upload))
            self.created += 1
            self.bytes += os.path.getsize(file_to_upload)

    # Add metadata to original dataset indicating this was run
    ext_meta = build_metadata(host, self.extractor_info, resource['parent']['id'],
                              {"files_created": fileids}, 'dataset')
    upload_metadata(connector, host, secret_key, resource['parent']['id'], ext_meta)

    self.end_message()

def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    if not contains_required_files(resource, ['raw', 'raw.hdr', 'image.jpg', 'frameIndex.txt', 'settings.txt']):
        self.log_skip(resource, "missing required files")
        return CheckMessage.ignore

    if resource['dataset_info']['name'].find("SWIR") > -1:
        sensor_fullname = 'swir_netcdf'
    else:
        sensor_fullname = 'vnir_netcdf'

    timestamp = resource['dataset_info']['name'].split(" - ")[1]
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            # Make sure outputs properly exist
            out_nc = self.sensors.get_sensor_path(timestamp, sensor=sensor_fullname)
            if file_exists(out_nc):
                self.log_skip(resource, "metadata v%s and outputs already exist" % self.extractor_info['version'])
                return CheckMessage.ignore
        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download
    else:
        self.log_skip(resource, "no terraref metadata found")

        # See if we can recover it from disk
        if sensor_fullname == 'vnir_netcdf':
            date = timestamp.split("__")[0]
            source_dir = "/home/extractor/sites/ua-mac/raw_data/VNIR/%s/%s/" % (date, timestamp)
            for f in os.listdir(source_dir):
                if f.endswith("_metadata.json"):
                    self.log_info(resource, "updating metadata from %s" % f)
                    raw_dsmd = load_json_file(os.path.join(source_dir, f))
                    clean_md = clean_metadata(raw_dsmd, 'VNIR')
                    complete_md = build_metadata(host, self.extractor_info, resource['id'], clean_md, 'dataset')
                    remove_metadata(connector, host, secret_key, resource['id'])
                    upload_metadata(connector, host, secret_key, resource['id'], complete_md)
                    return CheckMessage.download

        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    self.start_check(resource)

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check for a left and right TIF file - skip if not found
    if not contains_required_files(resource, ['_left.tif', '_right.tif']):
        self.log_skip(resource, "missing required files")
        # Check for raw_data_source in metadata and resubmit to bin2tif if available...
        md = download_metadata(connector, host, secret_key, resource['id'])
        terra_md = get_terraref_metadata(md)
        if 'raw_data_source' in terra_md:
            raw_id = str(terra_md['raw_data_source'].split("/")[-1])
            self.log_info(resource, "submitting raw source %s to bin2tif" % raw_id)
            submit_extraction(connector, host, secret_key, raw_id, "terra.stereo-rgb.bin2tif")
        return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            # Make sure outputs properly exist
            timestamp = resource['dataset_info']['name'].split(" - ")[1]
            left_nrmac_tiff = self.sensors.create_sensor_path(timestamp, opts=['left'])
            right_nrmac_tiff = self.sensors.create_sensor_path(timestamp, opts=['right'])
            if (self.leftonly and file_exists(left_nrmac_tiff)) or \
                    (not self.leftonly and file_exists(left_nrmac_tiff) and file_exists(right_nrmac_tiff)):
                if contains_required_files(resource, [os.path.basename(left_nrmac_tiff)]):
                    self.log_skip(resource, "metadata v%s and outputs already exist" % self.extractor_info['version'])
                    return CheckMessage.ignore
                else:
                    self.log_info(resource, "output file exists but not yet uploaded")
        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download
    else:
        self.log_skip(resource, "no terraref metadata found")
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if resource['name'].find('fullfield') > -1 and re.match(r"^.*\d+_ir_.*.tif", resource['name']):
        # Check metadata to verify we have what we need
        md = download_metadata(connector, host, secret_key, resource['id'])
        if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
            self.log_skip(resource, "metadata indicates it was already processed")
            return CheckMessage.ignore
        return CheckMessage.download
    else:
        self.log_skip(resource, "regex not matched for %s" % resource['name'])
        return CheckMessage.ignore

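# Illustrative aside: the fullfield guard above only downloads messages whose file name contains
# "fullfield" and matches r"^.*\d+_ir_.*.tif". The file names below are hypothetical, chosen only
# to show which side of the filter they fall on; note the unescaped "." before "tif" is kept
# exactly as the extractor has it.
import re
assert re.match(r"^.*\d+_ir_.*.tif", "fullfield_2017-06-28_ir_thumb.tif")        # matched -> download
assert not re.match(r"^.*\d+_ir_.*.tif", "fullfield_2017-06-28_rgb_thumb.tif")   # "_rgb_" -> ignore
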
def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    self.start_check(resource)

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check for a left and right BIN file - skip if not found
    found_left = False
    found_right = False
    for f in resource['files']:
        if 'filename' in f:
            if f['filename'].endswith('_left.bin'):
                found_left = True
            elif f['filename'].endswith('_right.bin'):
                found_right = True
    if not (found_left and found_right):
        self.log_skip(resource, "found left: %s, right: %s" % (found_left, found_right))
        return CheckMessage.ignore

    # Check if outputs already exist unless overwrite is forced - skip if found
    if not self.overwrite:
        timestamp = resource['dataset_info']['name'].split(" - ")[1]
        lbase = self.sensors.get_sensor_path(timestamp, opts=['left'], ext='')
        rbase = self.sensors.get_sensor_path(timestamp, opts=['right'], ext='')
        out_dir = os.path.dirname(lbase)
        if os.path.isfile(lbase + 'tif') and os.path.isfile(rbase + 'tif'):
            self.log_skip(resource, "outputs found in %s" % out_dir)
            return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
        self.log_skip(resource, "metadata indicates it was already processed")
        return CheckMessage.ignore
    if get_terraref_metadata(md):
        return CheckMessage.download
    else:
        self.log_skip(resource, "no terraref metadata found")
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if not is_latest_file(resource):
        return CheckMessage.ignore

    # Check for expected input files before beginning processing
    if len(get_all_files(resource)) >= 23:
        md = download_metadata(connector, host, secret_key, resource['id'])
        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            self.log_skip(resource, "metadata v%s already exists" % self.extractor_info['version'])
            return CheckMessage.ignore
        return CheckMessage.download
    else:
        self.log_skip(resource, 'not all input files are ready')
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if resource['name'].find('fullfield') > -1 and re.match(r"^.*\d+_rgb_.*thumb.tif", resource['name']):
        # Check metadata to verify we have what we need
        md = download_metadata(connector, host, secret_key, resource['parent']['id'])
        if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
            logging.info("skipping dataset %s; metadata indicates it was already processed" % resource['id'])
            return CheckMessage.ignore
        return CheckMessage.download
    return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    # Check for 0000-0101 bin files before beginning processing
    if len(resource['files']) < 102:
        return CheckMessage.ignore
    if not is_latest_file(resource):
        return CheckMessage.ignore

    timestamp = resource['dataset_info']['name'].split(" - ")[1]
    hist_path = self.sensors.get_sensor_path(timestamp, opts=['combined_hist'])
    coloredImg_path = self.sensors.get_sensor_path(timestamp, opts=['combined_pseudocolored'])

    # Count number of bin files in dataset, as well as number of existing outputs
    ind_add = 0
    ind_output = 0
    for ind in range(0, 102):
        format_ind = "{0:0>4}".format(ind)  # e.g. 1 becomes 0001
        for f in resource['files']:
            if f['filename'].endswith(format_ind + '.bin'):
                ind_add += 1
                out_png = self.sensors.get_sensor_path(timestamp, opts=[format_ind])
                if os.path.exists(out_png) and not self.overwrite:
                    ind_output += 1
                break

    # Do the outputs already exist?
    if ind_output == 102 and os.path.exists(hist_path) and os.path.exists(coloredImg_path):
        logging.info("skipping dataset %s, outputs already exist" % resource['id'])
        return CheckMessage.ignore
    # Do we have too few input BIN files?
    if ind_add < 102:
        return CheckMessage.ignore

    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
        logging.info("skipping dataset %s, found existing metadata" % resource['id'])
        return CheckMessage.ignore
    if get_terraref_metadata(md):
        return CheckMessage.download
    return CheckMessage.ignore

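# Worked example of the index padding above (purely illustrative): "{0:0>4}".format(ind)
# left-pads the loop index to four digits, which is how the 102 expected .bin files are
# numbered from ..._0000.bin through ..._0101.bin.
assert "{0:0>4}".format(1) == "0001"
assert "{0:0>4}".format(37) == "0037"
assert "{0:0>4}".format(101) == "0101"
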
def check_message(self, connector, host, secret_key, resource, parameters):
    if parameters["rulechecked"]:
        return CheckMessage.download

    if not is_latest_file(resource):
        return CheckMessage.ignore

    # Check for an ir.BIN file and metadata before beginning processing
    found_ir = None
    found_md = None
    for f in resource['files']:
        if 'filename' in f and f['filename'].endswith('_ir.bin'):
            found_ir = f['filepath']
        elif 'filename' in f and f['filename'].endswith('_metadata.json'):
            found_md = f['filepath']

    if found_ir:
        # Check if outputs already exist
        timestamp = resource['dataset_info']['name'].split(" - ")[1]
        png_path = self.sensors.get_sensor_path(timestamp, ext='png')
        tiff_path = self.sensors.get_sensor_path(timestamp)
        if os.path.exists(png_path) and os.path.exists(tiff_path) and not self.overwrite:
            logging.getLogger(__name__).info("skipping dataset %s, outputs already exist" % resource['id'])
            return CheckMessage.ignore

        # If we don't find _metadata.json file, check if we have metadata attached to dataset instead
        if not found_md:
            md = download_metadata(connector, host, secret_key, resource['id'])
            if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
                logging.getLogger(__name__).info("skipping dataset %s, already processed" % resource['id'])
                return CheckMessage.ignore
            if get_terraref_metadata(md):
                return CheckMessage.download
            return CheckMessage.ignore
        else:
            return CheckMessage.download

    return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    #if not is_latest_file(resource):
    #    return CheckMessage.ignore

    # Adjust sensor path based on VNIR vs SWIR
    if resource['dataset_info']['name'].find("SWIR") > -1:
        sensor_fullname = 'swir_netcdf'
    else:
        sensor_fullname = 'vnir_netcdf'

    if has_all_files(resource):
        # Check if output already exists
        timestamp = resource['dataset_info']['name'].split(" - ")[1]
        outFilePath = self.sensors.get_sensor_path(timestamp, sensor=sensor_fullname)

        if os.path.exists(outFilePath) and not self.overwrite:
            logging.getLogger(__name__).info('skipping dataset %s, output file already exists' % resource['id'])
            return CheckMessage.ignore
        else:
            # Check if we have necessary metadata, either as a .json file or attached to dataset
            md = download_metadata(connector, host, secret_key, resource['id'], self.extractor_info['name'])
            if get_extractor_metadata(md, self.extractor_info['name']) and not self.overwrite:
                logging.getLogger(__name__).info("skipping dataset %s, already processed" % resource['id'])
                return CheckMessage.ignore
            elif get_terraref_metadata(md):
                return CheckMessage.download
            else:
                for f in resource['files']:
                    if f['filename'] == 'metadata.json':
                        return CheckMessage.download
                return CheckMessage.ignore
    else:
        logging.getLogger(__name__).info('skipping dataset %s, not all input files are ready' % resource['id'])
        return CheckMessage.ignore

def check_message(self, connector, host, secret_key, resource, parameters):
    if "rulechecked" in parameters and parameters["rulechecked"]:
        return CheckMessage.download

    self.start_check(resource)

    if not is_latest_file(resource):
        self.log_skip(resource, "not latest file")
        return CheckMessage.ignore

    # Check metadata to verify we have what we need
    md = download_metadata(connector, host, secret_key, resource['id'])
    if get_terraref_metadata(md):
        # Check for a left and right TIF file - skip if not found
        # If we're only processing the left files, don't check for the right file
        needed_files = ['_left.tif']
        if not self.leftonly:
            needed_files.append('_right.tif')
        if not contains_required_files(resource, needed_files):
            self.log_skip(resource, "missing required files")
            return CheckMessage.ignore

        if get_extractor_metadata(md, self.extractor_info['name'], self.extractor_info['version']):
            # Make sure outputs properly exist
            timestamp = resource['dataset_info']['name'].split(" - ")[1]
            left_mask_tiff = self.sensors.create_sensor_path(timestamp, opts=['left'])
            right_mask_tiff = self.sensors.create_sensor_path(timestamp, opts=['right'])
            if (self.leftonly and file_exists(left_mask_tiff)) or \
                    (not self.leftonly and file_exists(left_mask_tiff) and file_exists(right_mask_tiff)):
                self.log_skip(resource, "metadata v%s and outputs already exist" %
                              self.extractor_info['version'])
                return CheckMessage.ignore
        # Check for other images to create a mask on
        elif not contains_required_files(resource, ['.tif']):
            self.log_skip(resource, "missing required tiff file")
            return CheckMessage.ignore

        # Have TERRA-REF metadata, but not any from this extractor
        return CheckMessage.download

def check_message(self, connector, host, secret_key, resource, parameters):
    if resource['type'] != "dataset":
        if 'name' not in resource:
            resource['name'] = resource["type"]
        self.log_skip(resource, "position is only logged for dataset metadata")
        return CheckMessage.ignore

    self.start_check(resource)

    if 'spatial_metadata' in resource['metadata']:
        ds_md = download_metadata(connector, host, secret_key, resource['id'])
        ext_md = get_extractor_metadata(ds_md, self.extractor_info['name'])
        if not ext_md:
            return CheckMessage.bypass
        else:
            self.log_skip(resource, "sensorposition metadata already exists")
            return CheckMessage.ignore
    else:
        self.log_skip(resource, "newly added metadata is not from LemnaTec")
        return CheckMessage.ignore

def process_message(self, connector, host, secret_key, resource, parameters):
    self.start_message(resource)

    # Get left/right files and metadata
    img_left, img_right, terra_md_full = None, None, None
    for fname in resource['local_paths']:
        if fname.endswith('_dataset_metadata.json'):
            all_dsmd = load_json_file(fname)
            terra_md_full = get_terraref_metadata(all_dsmd, 'stereoTop')
        elif fname.endswith('_left.bin'):
            img_left = fname
        elif fname.endswith('_right.bin'):
            img_right = fname
    if None in [img_left, img_right, terra_md_full]:
        raise ValueError("could not locate all files & metadata in processing")

    timestamp = resource['dataset_info']['name'].split(" - ")[1]

    # Fetch experiment name from terra metadata
    season_name, experiment_name, updated_experiment = get_season_and_experiment(timestamp, 'stereoTop', terra_md_full)
    if None in [season_name, experiment_name]:
        raise ValueError("season and experiment could not be determined")

    # Determine output directory
    self.log_info(resource, "Hierarchy: %s / %s / %s / %s / %s / %s / %s" % (
        season_name, experiment_name, self.sensors.get_display_name(),
        timestamp[:4], timestamp[5:7], timestamp[8:10], timestamp))
    target_dsid = build_dataset_hierarchy_crawl(host, secret_key, self.clowder_user, self.clowder_pass,
                                                self.clowderspace, season_name, experiment_name,
                                                self.sensors.get_display_name(),
                                                timestamp[:4], timestamp[5:7], timestamp[8:10],
                                                leaf_ds_name=self.sensors.get_display_name() + ' - ' + timestamp)
    left_tiff = self.sensors.create_sensor_path(timestamp, opts=['left'])
    right_tiff = self.sensors.create_sensor_path(timestamp, opts=['right'])
    uploaded_file_ids = []

    # Attach LemnaTec source metadata to Level_1 product if necessary
    target_md = download_metadata(connector, host, secret_key, target_dsid)
    if not get_extractor_metadata(target_md, self.extractor_info['name']):
        self.log_info(resource, "uploading LemnaTec metadata to ds [%s]" % target_dsid)
        remove_metadata(connector, host, secret_key, target_dsid, self.extractor_info['name'])
        terra_md_trim = get_terraref_metadata(all_dsmd)
        if updated_experiment is not None:
            terra_md_trim['experiment_metadata'] = updated_experiment
        terra_md_trim['raw_data_source'] = host + ("" if host.endswith("/") else "/") + "datasets/" + resource['id']
        level1_md = build_metadata(host, self.extractor_info, target_dsid, terra_md_trim, 'dataset')
        upload_metadata(connector, host, secret_key, target_dsid, level1_md)

    try:
        left_shape = terraref.stereo_rgb.get_image_shape(terra_md_full, 'left')
        gps_bounds_left = geojson_to_tuples(terra_md_full['spatial_metadata']['left']['bounding_box'])
        right_shape = terraref.stereo_rgb.get_image_shape(terra_md_full, 'right')
        gps_bounds_right = geojson_to_tuples(terra_md_full['spatial_metadata']['right']['bounding_box'])
    except KeyError:
        self.log_error(resource, "spatial metadata not properly identified; sending to cleaner")
        submit_extraction(connector, host, secret_key, resource['id'], "terra.metadata.cleaner")
        return

    if (not file_exists(left_tiff)) or self.overwrite:
        # Perform actual processing
        self.log_info(resource, "creating %s" % left_tiff)
        left_image = terraref.stereo_rgb.process_raw(left_shape, img_left, None)
        create_geotiff(left_image, gps_bounds_left, left_tiff, None, True,
                       self.extractor_info, terra_md_full, compress=True)
        self.created += 1
        self.bytes += os.path.getsize(left_tiff)
    # Check if the file should be uploaded, even if it was already created
    found_in_dest = check_file_in_dataset(connector, host, secret_key, target_dsid, left_tiff)
    if not found_in_dest:
        self.log_info(resource, "uploading %s" % left_tiff)
        fileid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass, target_dsid, left_tiff)
        uploaded_file_ids.append(host + ("" if host.endswith("/") else "/") + "files/" + fileid)

    if (not file_exists(right_tiff)) or self.overwrite:
        # Perform actual processing
        self.log_info(resource, "creating %s" % right_tiff)
        right_image = terraref.stereo_rgb.process_raw(right_shape, img_right, None)
        create_geotiff(right_image, gps_bounds_right, right_tiff, None, True,
                       self.extractor_info, terra_md_full, compress=True)
        self.created += 1
        self.bytes += os.path.getsize(right_tiff)
    # Check if the file should be uploaded, even if it was already created
    found_in_dest = check_file_in_dataset(connector, host, secret_key, target_dsid, right_tiff)
    if not found_in_dest:
        self.log_info(resource, "uploading %s" % right_tiff)
        fileid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass, target_dsid, right_tiff)
        uploaded_file_ids.append(host + ("" if host.endswith("/") else "/") + "files/" + fileid)

    # Trigger additional extractors
    self.log_info(resource, "triggering downstream extractors")
    submit_extraction(connector, host, secret_key, target_dsid, "terra.stereo-rgb.rgbmask")
    submit_extraction(connector, host, secret_key, target_dsid, "terra.stereo-rgb.nrmac")
    submit_extraction(connector, host, secret_key, target_dsid, "terra.plotclipper_tif")

    # Tell Clowder this is completed so subsequent file updates don't daisy-chain
    if len(uploaded_file_ids) > 0:
        extractor_md = build_metadata(host, self.extractor_info, target_dsid,
                                      {"files_created": uploaded_file_ids}, 'dataset')
        self.log_info(resource, "uploading extractor metadata to raw dataset")
        remove_metadata(connector, host, secret_key, resource['id'], self.extractor_info['name'])
        try:
            upload_metadata(connector, host, secret_key, resource['id'], extractor_md)
        except:
            self.log_info(resource, "problem uploading extractor metadata...")

    self.end_message(resource)

def process_message(self, connector, host, secret_key, resource, parameters):
    self.start_message(resource)

    sensor_type, timestamp = resource['name'].split(" - ")

    # First, re-check metadata to verify it hasn't been added in meantime
    ds_md = download_metadata(connector, host, secret_key, resource['id'])
    terra_md = get_terraref_metadata(ds_md)
    if terra_md:
        self.log_info(resource, "Found TERRA-REF metadata; not cleaning")
        return

    # These datasets do not have TERRA md
    uncleanables = ["Full Field"]
    if sensor_type in uncleanables:
        self.log_info(resource, "Cannot clean metadata for %s" % sensor_type)
        return

    # For these datasets, we must get TERRA md from raw_data source
    lv1_types = {"RGB GeoTIFFs": "stereoTop", "Thermal IR GeoTIFFs": "flirIrCamera"}
    if sensor_type in lv1_types:
        raw_equiv = resource['name'].replace(sensor_type, lv1_types[sensor_type])
        source_dir = os.path.dirname(self.sensors.get_sensor_path_by_dataset(raw_equiv))
    else:
        # Search for metadata.json source file
        source_dir = os.path.dirname(self.sensors.get_sensor_path_by_dataset(resource['name']))
    source_dir = self.remapMountPath(connector, source_dir)

    if self.delete:
        # Delete all existing metadata from this dataset
        self.log_info(resource, "Deleting existing metadata")
        delete_dataset_metadata(host, self.clowder_user, self.clowder_pass, resource['id'])

    # TODO: split between the PLY files (in Level_1) and metadata.json files - unique to this sensor
    if sensor_type == "scanner3DTop":
        source_dir = source_dir.replace("Level_1", "raw_data")

    self.log_info(resource, "Searching for metadata.json in %s" % source_dir)
    if os.path.isdir(source_dir):
        md_file = None
        for f in os.listdir(source_dir):
            if f.endswith("metadata.json"):
                md_file = os.path.join(source_dir, f)
        if md_file:
            self.log_info(resource, "Found metadata.json; cleaning")
            md_json = clean_metadata(load_json_file(md_file), sensor_type)
            format_md = {
                "@context": ["https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld",
                             {"@vocab": "https://terraref.ncsa.illinois.edu/metadata/uamac#"}],
                "content": md_json,
                "agent": {
                    "@type": "cat:user",
                    "user_id": "https://terraref.ncsa.illinois.edu/clowder/api/users/%s" % self.userid
                }
            }
            self.log_info(resource, "Uploading cleaned metadata")
            upload_metadata(connector, host, secret_key, resource['id'], format_md)

            # Now trigger a callback extraction if given
            if len(self.callback) > 0:
                self.log_info(resource, "Submitting callback extraction to %s" % self.callback)
                submit_extraction(connector, host, secret_key, resource['id'], self.callback)
            else:
                callbacks = self.get_callbacks_by_sensor(sensor_type)
                if callbacks:
                    for c in callbacks:
                        self.log_info(resource, "Submitting callback extraction to %s" % c)
                        submit_extraction(connector, host, secret_key, resource['id'], c)
                else:
                    self.log_info(resource, "No default callback found for %s" % sensor_type)
        else:
            self.log_error(resource, "metadata.json not found in %s" % source_dir)
    else:
        self.log_error(resource, "%s could not be found" % source_dir)

    # TODO: Have extractor check for existence of Level_1 output product and delete if exists?
    self.end_message(resource)

def notifyClowderOfCompletedTask(task):
    # Verify that globus user has a mapping to clowder credentials in config file
    globUser = task['user']
    userMap = config['clowder']['user_map']

    if globUser in userMap:
        logger.info("%s task complete; notifying Clowder" % task['globus_id'], extra={
            "globus_id": task['globus_id'],
            "action": "NOTIFYING CLOWDER OF COMPLETION"
        })
        clowder_host = config['clowder']['host']
        clowder_key = config['clowder']['secret_key']
        clowder_user = userMap[globUser]['clowder_user']
        clowder_pass = userMap[globUser]['clowder_pass']
        clowder_id = userMap[globUser]['clowder_id']
        clowder_context = userMap[globUser]['context']

        sess = requests.Session()
        sess.auth = (clowder_user, clowder_pass)

        # Response can be OK, RETRY or ERROR
        response = "OK"

        # Prepare upload object with all file(s) found
        updatedTask = safeCopy(task)

        space_id = task['contents']['space_id'] if 'space_id' in task['contents'] else config['clowder']['primary_space']
        for ds in task['contents']:
            # Skip any unexpected files at root level, e.g.
            # /home/clowder/sites/ua-mac/raw_data/GetFluorescenceValues.m
            # /home/clowder/sites/ua-mac/raw_data/irrigation/2017-06-04/@Recycle/flowmetertotals_March-2017.csv",
            if ds in ["LemnaTec - MovingSensor"] or ds.find("@Recycle") > -1:
                continue

            filesQueued = []
            fileFormData = []
            datasetMD = None
            datasetMDFile = False
            lastFile = None
            lastFileKey = None
            sensorname = ds.split(" - ")[0]

            logger.info("%s -- Processing [%s]" % (task['globus_id'], ds))

            # Assign dataset-level metadata if provided
            if "md" in task['contents'][ds]:
                datasetMD = task['contents'][ds]['md']

            # Add local files to dataset by path
            if 'files' in task['contents'][ds]:
                for fkey in task['contents'][ds]['files']:
                    fobj = task['contents'][ds]['files'][fkey]
                    if 'clowder_id' not in fobj or fobj['clowder_id'] == "":
                        if os.path.exists(fobj['path']):
                            if fobj['name'].find("metadata.json") == -1:
                                if 'md' in fobj:
                                    # Use [1,-1] to avoid json.dumps wrapping quotes
                                    # Replace \" with " to avoid json.dumps escaping quotes
                                    mdstr = ', "md":' + json.dumps(fobj['md'])[1:-1].replace('\\"', '"')
                                else:
                                    mdstr = ""
                                filesQueued.append((fobj['path'], mdstr))
                                lastFile = fobj['name']
                                lastFileKey = fkey
                            else:
                                try:
                                    datasetMD = loadJsonFile(fobj['path'])
                                    datasetMDFile = fkey
                                except:
                                    logger.error("[%s] could not decode JSON from %s" % (ds, fobj['path']))
                                    updatedTask['contents'][ds]['files'][fkey]['clowder_id'] = "FILE NOT FOUND"
                                    updatedTask['contents'][ds]['files'][fkey]['error'] = "Failed to load JSON"
                                    writeTaskToDatabase(updatedTask)
                                    if response == "OK":
                                        response = "ERROR"  # Don't overwrite a RETRY
                        else:
                            logger.error("[%s] file not found: %s" % (ds, fobj['path']))
                            updatedTask['contents'][ds]['files'][fkey]['clowder_id'] = "FILE NOT FOUND"
                            updatedTask['contents'][ds]['files'][fkey]['error'] = "File not found"
                            writeTaskToDatabase(updatedTask)
                            if response == "OK":
                                response = "ERROR"  # Don't overwrite a RETRY

            if len(filesQueued) > 0 or datasetMD:
                # Try to clean metadata first
                if datasetMD:
                    cleaned_dsmd = None
                    try:
                        cleaned_dsmd = clean_metadata(datasetMD, sensorname)
                    except Exception as e:
                        logger.error("[%s] could not clean md: %s" % (ds, str(e)))
                        task['contents'][ds]['error'] = "Could not clean metadata: %s" % str(e)
                        # TODO: possible this could be recoverable with more info from clean_metadata
                        if response == "OK":
                            response = "ERROR"  # Don't overwrite a RETRY

                if ds.find(" - ") > -1:
                    # e.g. "co2Sensor - 2016-12-25" or "VNIR - 2016-12-25__12-32-42-123"
                    c_sensor = ds.split(" - ")[0]
                    c_date = ds.split(" - ")[1]
                    c_year = c_date.split('-')[0]
                    c_month = c_date.split('-')[1]
                    if c_date.find("__") == -1:
                        # If we only have a date and not a timestamp, don't create date collection
                        c_date = None
                    else:
                        c_date = c_date.split("__")[0].split("-")[2]
                else:
                    c_sensor, c_date, c_year, c_month = ds, None, None, None

                # Get dataset from clowder, or create & associate with collections
                try:
                    hierarchy_host = clowder_host + ("/" if not clowder_host.endswith("/") else "")
                    dsid = build_dataset_hierarchy(hierarchy_host, clowder_key, clowder_user, clowder_pass,
                                                   space_id, c_sensor, c_year, c_month, c_date, ds)
                    logger.info(" [%s] id: %s" % (ds, dsid))
                except Exception as e:
                    logger.error("[%s] could not build hierarchy: %s" % (ds, str(e)))
                    task['contents'][ds]['retry'] = "Could not build dataset hierarchy: %s" % str(e)
                    response = "RETRY"
                    continue

                if dsid:
                    dsFileList = fetchDatasetFileList(dsid, sess)
                    # Only send files not already present in dataset by path
                    for queued in filesQueued:
                        alreadyStored = False
                        for storedFile in dsFileList:
                            if queued[0] == storedFile['filepath']:
                                logger.info(" skipping file %s (already uploaded)" % queued[0])
                                alreadyStored = True
                                break
                        if not alreadyStored:
                            fileFormData.append(("file", '{"path":"%s"%s}' % (queued[0], queued[1])))

                    if datasetMD and cleaned_dsmd:
                        # Check for existing metadata from the site user
                        alreadyAttached = False
                        md_existing = download_metadata(None, hierarchy_host, clowder_key, dsid)
                        for mdobj in md_existing:
                            if 'agent' in mdobj and 'user_id' in mdobj['agent']:
                                if mdobj['agent']['user_id'] == "https://terraref.ncsa.illinois.edu/clowder/api/users/%s" % clowder_id:
                                    logger.info(" skipping metadata (already attached)")
                                    alreadyAttached = True
                                    break

                        if not alreadyAttached:
                            md = {
                                "@context": [
                                    "https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld",
                                    {"@vocab": clowder_context}
                                ],
                                "content": cleaned_dsmd,
                                "agent": {
                                    "@type": "cat:user",
                                    "user_id": "https://terraref.ncsa.illinois.edu/clowder/api/users/%s" % clowder_id
                                }
                            }

                            dsmd = sess.post(clowder_host + "/api/datasets/" + dsid + "/metadata.jsonld",
                                             headers={'Content-Type': 'application/json'},
                                             data=json.dumps(md))

                            if dsmd.status_code in [500, 502, 504]:
                                logger.error("[%s] failed to attach metadata (%s: %s)" % (ds, dsmd.status_code, dsmd.text))
                                updatedTask['contents'][ds]['files'][datasetMDFile]['retry'] = "%s: %s" % (dsmd.status_code, dsmd.text)
                                response = "RETRY"
                            elif dsmd.status_code != 200:
                                logger.error("[%s] failed to attach metadata (%s: %s)" % (ds, dsmd.status_code, dsmd.text))
                                updatedTask['contents'][ds]['files'][datasetMDFile]['error'] = "%s: %s" % (dsmd.status_code, dsmd.text)
                                response = "ERROR"
                            else:
                                if datasetMDFile:
                                    logger.info(" [%s] added metadata from .json file" % ds, extra={
                                        "dataset_name": ds,
                                        "dataset_id": dsid,
                                        "action": "METADATA ADDED",
                                        "metadata": datasetMD
                                    })
                                    updatedTask['contents'][ds]['files'][datasetMDFile]['metadata_loaded'] = True
                                    updatedTask['contents'][ds]['files'][datasetMDFile]['clowder_id'] = "attached to dataset"
                                    writeTaskToDatabase(updatedTask)
                                else:
                                    # Remove metadata from activeTasks on success even if file upload fails in next step, so we don't repeat md
                                    logger.info(" [%s] added metadata" % ds, extra={
                                        "dataset_name": ds,
                                        "dataset_id": dsid,
                                        "action": "METADATA ADDED",
                                        "metadata": datasetMD
                                    })
                                    del updatedTask['contents'][ds]['md']
                                    writeTaskToDatabase(updatedTask)

                    if len(fileFormData) > 0:
                        # Upload collected files for this dataset
                        # Boundary encoding from http://stackoverflow.com/questions/17982741/python-using-reuests-library-for-multipart-form-data
                        logger.info(" [%s] uploading unprocessed files" % ds, extra={
                            "dataset_id": dsid,
                            "dataset_name": ds,
                            "action": "UPLOADING FILES",
                            "filelist": fileFormData
                        })
                        (content, header) = encode_multipart_formdata(fileFormData)
                        fi = sess.post(clowder_host + "/api/uploadToDataset/" + dsid,
                                       headers={'Content-Type': header},
                                       data=content)

                        if fi.status_code in [104, 500, 502, 504]:
                            logger.error("[%s] failed to attach files (%s: %s)" % (ds, fi.status_code, fi.text))
                            updatedTask['contents'][ds]['files'][datasetMDFile]['retry'] = "%s: %s" % (fi.status_code, fi.text)
                            response = "RETRY"
                        elif fi.status_code != 200:
                            logger.error("[%s] failed to attach files (%s: %s)" % (ds, fi.status_code, fi.text))
                            updatedTask['contents'][ds]['files'][datasetMDFile]['error'] = "%s: %s" % (fi.status_code, fi.text)
                            response = "ERROR"
                        else:
                            loaded = fi.json()
                            if 'ids' in loaded:
                                for fobj in loaded['ids']:
                                    logger.info(" [%s] added file %s" % (ds, fobj['name']))
                                    for fkey in updatedTask['contents'][ds]['files']:
                                        if updatedTask['contents'][ds]['files'][fkey]['name'] == fobj['name']:
                                            updatedTask['contents'][ds]['files'][fkey]['clowder_id'] = fobj['id']
                                            # remove any previous retry/error messages
                                            if 'retry' in updatedTask['contents'][ds]['files'][fkey]:
                                                del(updatedTask['contents'][ds]['files'][fkey]['retry'])
                                            if 'error' in updatedTask['contents'][ds]['files'][fkey]:
                                                del(updatedTask['contents'][ds]['files'][fkey]['error'])
                                            break
                                writeTaskToDatabase(updatedTask)
                            else:
                                logger.info(" [%s] added file %s" % (ds, lastFile))
                                updatedTask['contents'][ds]['files'][lastFileKey]['clowder_id'] = loaded['id']
                                # remove any previous retry/error messages
                                if 'retry' in updatedTask['contents'][ds]['files'][lastFileKey]:
                                    del(updatedTask['contents'][ds]['files'][lastFileKey]['retry'])
                                if 'error' in updatedTask['contents'][ds]['files'][lastFileKey]:
                                    del(updatedTask['contents'][ds]['files'][lastFileKey]['error'])
                                writeTaskToDatabase(updatedTask)

        return response
    else:
        logger.error("%s task: no credentials for Globus user %s" % (task['globus_id'], globUser))
        return "ERROR"

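# Side note on the multipart upload step above (illustrative sketch, assuming the
# encode_multipart_formdata used here is urllib3's helper of the same name): it returns a
# (body, content_type) pair, which is why the gateway posts `data=content` and sets the
# Content-Type header to the returned boundary string rather than letting requests build the
# form itself. The path below is a placeholder.
from urllib3.filepost import encode_multipart_formdata
demo_body, demo_content_type = encode_multipart_formdata([("file", '{"path":"/tmp/example.bin"}')])
# demo_content_type looks like: multipart/form-data; boundary=...
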
def process_message(self, connector, host, secret_key, resource, parameters):
    self.start_message()

    tmp_csv = "meantemptraits.csv"
    csv_file = open(tmp_csv, 'w')
    (fields, traits) = get_traits_table()
    csv_file.write(','.join(map(str, fields)) + '\n')

    # Get full list of experiment plots using date as filter
    ds_info = get_info(connector, host, secret_key, resource['parent']['id'])
    dsmd = download_metadata(connector, host, secret_key, resource['parent']['id'])
    timestamp = ds_info['name'].split(" - ")[1]
    all_plots = get_site_boundaries(timestamp, city='Maricopa')

    successful_plots = 0
    for plotname in all_plots:
        bounds = all_plots[plotname]

        # Use GeoJSON string to clip full field to this plot
        (pxarray, geotrans) = clip_raster(resource['local_paths'][0], bounds)
        #tc = getFlir.rawData_to_temperature(pxarray, terramd) # get temperature

        # Filter out any negative values
        pxarray[pxarray < 0] = numpy.nan
        mean_tc = numpy.nanmean(pxarray) - 273.15

        # Create BETY-ready CSV
        if not numpy.isnan(mean_tc):
            traits['surface_temperature'] = str(mean_tc)
            traits['site'] = plotname
            traits['local_datetime'] = timestamp + "T12:00:00"
            trait_list = generate_traits_list(traits)
            #generate_csv(tmp_csv, fields, trait_list)
            csv_file.write(','.join(map(str, trait_list)) + '\n')

            # Prepare and submit datapoint
            centroid_lonlat = json.loads(centroid_from_geojson(bounds))["coordinates"]
            time_fmt = timestamp + "T12:00:00-07:00"
            dpmetadata = {
                "source": host + ("" if host.endswith("/") else "/") + "files/" + resource['id'],
                "surface_temperature": str(mean_tc)
            }
            create_datapoint_with_dependencies(connector, host, secret_key,
                                               "IR Surface Temperature",
                                               (centroid_lonlat[1], centroid_lonlat[0]),
                                               time_fmt, time_fmt, dpmetadata, timestamp)

            successful_plots += 1

    # Submit CSV to BETY
    csv_file.close()
    submit_traits(tmp_csv, betykey=self.bety_key)

    # Tell Clowder this is completed so subsequent file updates don't daisy-chain
    metadata = build_metadata(host, self.extractor_info, resource['parent']['id'], {
        "plots_processed": successful_plots,
        "plots_skipped": len(all_plots) - successful_plots,
        "betydb_link": "https://terraref.ncsa.illinois.edu/bety/api/beta/variables?name=surface_temperature"
    }, 'dataset')
    upload_metadata(connector, host, secret_key, resource['parent']['id'], metadata)

    self.end_message()

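# Worked example of the per-plot temperature step above (made-up pixel values, purely
# illustrative): negative pixels are masked to NaN so nanmean ignores them, and the plot mean
# is converted from Kelvin to degrees Celsius.
import numpy
demo_px = numpy.array([[300.15, 301.15], [-1.0, 302.15]])
demo_px[demo_px < 0] = numpy.nan
demo_mean_c = numpy.nanmean(demo_px) - 273.15   # -> 28.0
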
def process_message(self, connector, host, secret_key, resource, parameters):
    self.start_message()

    sensor_type, timestamp = resource['name'].split(" - ")
    targets = self.get_targets_by_sensor(sensor_type)
    source = self.get_source_by_sensor(sensor_type)

    existing_files = {}
    for t in targets:
        for f in resource['files']:
            if f['filename'].endswith(t):
                logging.getLogger(__name__).info("Found %s" % f['filename'])
                existing_files[t] = f['filename']
                break

    if len(existing_files) == len(targets):
        logging.getLogger(__name__).info("Target files already exist")

        # If there are bin2tif files previously created, are they valid?
        dsmd = download_metadata(connector, host, secret_key, resource['id'])
        for md in dsmd:
            if 'extractor_id' in md['agent'] and md['agent']['extractor_id'].endswith(source):
                # Found bin2tif metadata - are previously created files valid?
                logging.getLogger(__name__).info("Found metadata from %s" % source)
                for url in md['content']['files_created']:
                    fid = url.split("/")[-1]
                    i = download_info(connector, host, secret_key, fid)
                    i = self.remapMountPath(connector, i['filepath'])
                    logging.getLogger(__name__).info("Checking validity of %s" % i)
                    if not os.path.isfile(i):
                        # Found invalid file - nuke the entire site from orbit
                        logging.getLogger(__name__).info("Invalid; deleting metadata")
                        self.delete_dataset_metadata(host, self.clowder_user, self.clowder_pass, resource['id'], source)

                        # Now trigger a callback extraction if given
                        if len(self.callback) > 0:
                            logging.getLogger(__name__).info("Submitting callback extraction to %s" % self.callback)
                            submit_extraction(connector, host, secret_key, resource['id'], self.callback)
                        else:
                            callbacks = self.get_callbacks_by_sensor(sensor_type)
                            if callbacks:
                                for c in callbacks:
                                    logging.getLogger(__name__).info("Submitting callback extraction to %s" % c)
                                    submit_extraction(connector, host, secret_key, resource['id'], c)
                            else:
                                logging.getLogger(__name__).info("No default callback found for %s" % sensor_type)
                        break
    else:
        # Search for target source files
        source_dir = os.path.dirname(self.sensors.get_sensor_path_by_dataset(resource['name']))
        source_dir = self.remapMountPath(connector, source_dir)
        if sensor_type == "scanner3DTop":
            source_dir = source_dir.replace("Level_1", "raw_data")

        logging.getLogger(__name__).info("Searching for target files in %s" % source_dir)
        if os.path.isdir(source_dir):
            targ_files = {}
            for f in os.listdir(source_dir):
                for t in targets:
                    if f.endswith(t):
                        targ_files[t] = os.path.join(source_dir, f)
                        break

            if targ_files != {}:
                for t in targ_files:
                    logging.getLogger(__name__).info("Uploading %s" % (targ_files[t]))
                    upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass, resource['id'], targ_files[t])

                # Now trigger a callback extraction if given
                if len(self.callback) > 0:
                    logging.getLogger(__name__).info("Submitting callback extraction to %s" % self.callback)
                    submit_extraction(connector, host, secret_key, resource['id'], self.callback)
                else:
                    callbacks = self.get_callbacks_by_sensor(sensor_type)
                    if callbacks:
                        for c in callbacks:
                            logging.getLogger(__name__).info("Submitting callback extraction to %s" % c)
                            submit_extraction(connector, host, secret_key, resource['id'], c)
                    else:
                        logging.getLogger(__name__).info("No default callback found for %s" % sensor_type)
            else:
                logging.getLogger(__name__).error("targets not found in %s" % source_dir)
        else:
            logging.getLogger(__name__).info("%s could not be found" % source_dir)
