def process_message(self, connector, host, secret_key, resource, parameters):
    """Clip every geo-referenced file in the dataset into per-plot shards and upload them.

    GeoTIFFs are clipped and uploaded directly. LAS point clouds are clipped,
    merged into a per-scan, per-plot LAS (tracked via a _contents.txt manifest
    so re-runs do not double-merge), uploaded, and the las2height extractor is
    triggered on the target dataset.
    """
    self.start_message(resource)

    # Load TERRA-REF metadata from the dataset; it supplies the spatial
    # information used to locate each sensor product on the field.
    terra_md_full = None
    spatial_meta = None
    for fname in resource['local_paths']:
        if fname.endswith('_dataset_metadata.json'):
            all_dsmd = load_json_file(fname)
            terra_md_full = get_terraref_metadata(all_dsmd)
            if 'spatial_metadata' in terra_md_full:
                spatial_meta = terra_md_full['spatial_metadata']
            else:
                spatial_meta = None
    if not spatial_meta:
        # BUG FIX: the exception was previously constructed but never raised,
        # so processing silently continued without spatial metadata.
        raise ValueError("No spatial metadata found.")

    # Determine which files in the dataset need clipping. Each entry records
    # the path, bounding box, AND originating sensor so the per-file loop
    # below does not depend on loop-carried state (the original read the
    # last sensor_name assigned during this scan, not the current file's).
    files_to_process = {}
    for f in resource['local_paths']:
        filename = os.path.basename(f)
        # NOTE(review): these startswith() checks run against the full local
        # path, not the basename -- confirm local_paths are relative names.
        if f.startswith("ir_geotiff") and f.endswith(".tif"):
            files_to_process[filename] = {
                "path": f,
                "sensor_name": "ir_geotiff",
                "bounds": spatial_meta['flirIrCamera']['bounding_box']
            }
        elif f.startswith("rgb_geotiff") and f.endswith(".tif"):
            side = "left" if f.endswith("_left.tif") else "right"
            files_to_process[filename] = {
                "path": f,
                "sensor_name": "rgb_geotiff",
                "bounds": spatial_meta[side]['bounding_box']
            }
        elif f.endswith(".las"):
            files_to_process[filename] = {
                "path": f,
                "sensor_name": "laser3d_las",
                "bounds": get_las_extents(f)
            }
        # TODO: Add case for laser3d heightmap

    # Fetch experiment name from terra metadata
    timestamp = resource['dataset_info']['name'].split(" - ")[1]
    season_name, experiment_name, updated_experiment = get_season_and_experiment(timestamp, 'plotclipper', terra_md_full)
    if None in [season_name, experiment_name]:
        raise ValueError("season and experiment could not be determined")

    # Determine script name (scan identifier) used to name the merged LAS.
    target_scan = "unknown_scan"
    if 'gantry_variable_metadata' in terra_md_full:
        if 'script_name' in terra_md_full['gantry_variable_metadata']:
            target_scan = terra_md_full['gantry_variable_metadata']['script_name']
            if 'script_hash' in terra_md_full['gantry_variable_metadata']:
                target_scan += ' ' + terra_md_full['gantry_variable_metadata']['script_hash']

    all_plots = get_site_boundaries(timestamp.split("__")[0], city='Maricopa')
    uploaded_file_ids = []

    for filename in files_to_process:
        file_path = files_to_process[filename]["path"]
        file_bounds = files_to_process[filename]["bounds"]
        sensor_name = files_to_process[filename]["sensor_name"]

        overlap_plots = find_plots_intersect_boundingbox(file_bounds, all_plots, fullmac=True)
        if len(overlap_plots) > 0:
            self.log_info(resource, "Attempting to clip %s into %s plot shards" % (filename, len(overlap_plots)))
            for plotname in overlap_plots:
                plot_bounds = overlap_plots[plotname]
                tuples = geojson_to_tuples_betydb(yaml.safe_load(plot_bounds))

                plot_display_name = self.sensors.get_display_name(sensor=sensor_name) + " (By Plot)"
                leaf_dataset = plot_display_name + ' - ' + plotname + " - " + timestamp.split("__")[0]
                self.log_info(resource, "Hierarchy: %s / %s / %s / %s / %s / %s / %s" % (
                    season_name, experiment_name, plot_display_name,
                    timestamp[:4], timestamp[5:7], timestamp[8:10], leaf_dataset))
                target_dsid = build_dataset_hierarchy_crawl(host, secret_key, self.clowder_user, self.clowder_pass,
                                                            self.clowderspace, season_name, experiment_name,
                                                            plot_display_name, timestamp[:4], timestamp[5:7],
                                                            timestamp[8:10], leaf_ds_name=leaf_dataset)

                out_file = self.sensors.create_sensor_path(timestamp, plot=plotname, subsensor=sensor_name, filename=filename)
                if not os.path.exists(os.path.dirname(out_file)):
                    os.makedirs(os.path.dirname(out_file))

                if filename.endswith(".tif") and (not file_exists(out_file) or self.overwrite):
                    # If file is a geoTIFF, simply clip it and upload it to Clowder
                    clip_raster(file_path, tuples, out_path=out_file)

                    # BUG FIX: this branch previously referenced merged_out,
                    # which is only assigned in the LAS branch (NameError on
                    # the first GeoTIFF); the clipped output here is out_file.
                    found_in_dest = check_file_in_dataset(connector, host, secret_key, target_dsid, out_file,
                                                         remove=self.overwrite)
                    if not found_in_dest or self.overwrite:
                        fileid = upload_to_dataset(connector, host, secret_key, target_dsid, out_file)
                        uploaded_file_ids.append(host + ("" if host.endswith("/") else "/") + "files/" + fileid)
                        self.created += 1
                        self.bytes += os.path.getsize(out_file)

                elif filename.endswith(".las"):
                    # If file is LAS, we can merge with any existing scan+plot output safely
                    merged_out = os.path.join(os.path.dirname(out_file), target_scan + "_merged.las")
                    merged_txt = merged_out.replace(".las", "_contents.txt")

                    already_merged = False
                    if os.path.exists(merged_txt):
                        # Check the manifest so re-runs don't merge the same source twice.
                        with open(merged_txt, 'r') as contents:
                            for entry in contents.readlines():
                                if entry.strip() == file_path:
                                    already_merged = True
                                    break
                    if not already_merged:
                        clip_las(file_path, tuples, out_path=out_file, merged_path=merged_out)
                        with open(merged_txt, 'a') as contents:
                            contents.write(file_path + "\n")

                    # Upload the individual plot shards for optimizing las2height later
                    found_in_dest = check_file_in_dataset(connector, host, secret_key, target_dsid, out_file,
                                                         remove=self.overwrite)
                    if not found_in_dest or self.overwrite:
                        fileid = upload_to_dataset(connector, host, secret_key, target_dsid, out_file)
                        uploaded_file_ids.append(host + ("" if host.endswith("/") else "/") + "files/" + fileid)
                        self.created += 1
                        self.bytes += os.path.getsize(out_file)

                    # Upload the merged result if necessary
                    found_in_dest = check_file_in_dataset(connector, host, secret_key, target_dsid, merged_out,
                                                         remove=self.overwrite)
                    if not found_in_dest or self.overwrite:
                        fileid = upload_to_dataset(connector, host, secret_key, target_dsid, merged_out)
                        uploaded_file_ids.append(host + ("" if host.endswith("/") else "/") + "files/" + fileid)
                        self.created += 1
                        self.bytes += os.path.getsize(merged_out)

                    # Trigger las2height extractor
                    submit_extraction(connector, host, secret_key, target_dsid, "terra.3dscanner.las2height")

    # Tell Clowder this is completed so subsequent file updates don't daisy-chain
    extractor_md = build_metadata(host, self.extractor_info, resource['id'], {
        "files_created": uploaded_file_ids
    }, 'dataset')
    self.log_info(resource, "uploading extractor metadata to Level_1 dataset")
    remove_metadata(connector, host, secret_key, resource['id'], self.extractor_info['name'])
    upload_metadata(connector, host, secret_key, resource['id'], extractor_md)

    self.end_message(resource)
def process_message(self, connector, host, secret_key, resource, parameters):
    """Compute per-plot canopy cover from a full-field RGB GeoTIFF and upload CSVs.

    Writes one BETYdb-format CSV and one Geostreams-format CSV next to the
    configured rgb_fullfield sensor path, uploads both to the parent dataset,
    and triggers the BETYdb and Geostreams extractors on them.
    """
    self.start_message(resource)

    # Write the CSV to the same directory as the source file
    ds_info = get_info(connector, host, secret_key, resource['parent']['id'])
    timestamp = ds_info['name'].split(" - ")[1]
    time_fmt = timestamp + "T12:00:00-07:00"
    rootdir = self.sensors.create_sensor_path(timestamp, sensor="rgb_fullfield", ext=".csv")
    out_csv = os.path.join(os.path.dirname(rootdir),
                           resource['name'].replace(".tif", "_canopycover_bety.csv"))
    out_geo = os.path.join(os.path.dirname(rootdir),
                           resource['name'].replace(".tif", "_canopycover_geo.csv"))

    # TODO: What should happen if CSV already exists? If we're here, there's no completed metadata...

    self.log_info(resource, "Writing BETY CSV to %s" % out_csv)
    self.log_info(resource, "Writing Geostreams CSV to %s" % out_geo)
    (fields, traits) = get_traits_table()

    # Context managers guarantee both CSVs are closed even if a plot raises.
    with open(out_csv, 'w') as csv_file, open(out_geo, 'w') as geo_file:
        csv_file.write(','.join(map(str, fields)) + '\n')
        geo_file.write(','.join(['site', 'trait', 'lat', 'lon', 'dp_time',
                                 'source', 'value', 'timestamp']) + '\n')

        # Get full list of experiment plots using date as filter
        all_plots = get_site_boundaries(timestamp, city='Maricopa')
        self.log_info(resource, "found %s plots on %s" % (len(all_plots), timestamp))
        successful_plots = 0
        for plotname in all_plots:
            # Kansas plots are handled elsewhere
            if plotname.find("KSU") > -1:
                self.log_info(resource, "skipping %s" % plotname)
                continue

            bounds = all_plots[plotname]
            tuples = geojson_to_tuples_betydb(yaml.safe_load(bounds))
            centroid_lonlat = json.loads(centroid_from_geojson(bounds))["coordinates"]

            # Use GeoJSON string to clip full field to this plot
            try:
                pxarray = clip_raster(resource['local_paths'][0], tuples)
                if pxarray is None:
                    continue
                if len(pxarray.shape) < 3:
                    self.log_error(resource, "unexpected array shape for %s (%s)" % (plotname, pxarray.shape))
                    continue

                ccVal = calculate_canopycover_masked(rollaxis(pxarray, 0, 3))
                if ccVal > -1:
                    # Prepare and submit datapoint
                    geo_file.write(','.join([plotname,
                                             'Canopy Cover',
                                             str(centroid_lonlat[1]),
                                             str(centroid_lonlat[0]),
                                             time_fmt,
                                             host + ("" if host.endswith("/") else "/") + "files/" + resource['id'],
                                             str(ccVal),
                                             timestamp]) + '\n')
                    successful_plots += 1
                    if successful_plots % 10 == 0:
                        self.log_info(resource, "processed %s/%s plots" % (successful_plots, len(all_plots)))

                    # BUG FIX: the BETY trait row was previously written after
                    # the except handler, where ccVal could be unbound (or
                    # stale from a prior plot) when clipping failed.
                    traits['canopy_cover'] = str(ccVal)
                    traits['site'] = plotname
                    traits['local_datetime'] = timestamp + "T12:00:00"
                    trait_list = generate_traits_list(traits)
                    csv_file.write(','.join(map(str, trait_list)) + '\n')
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                # still propagate; the per-plot failure is logged and skipped.
                self.log_error(resource, "error generating cc for %s" % plotname)
                continue

    # Upload this CSV to Clowder
    fileid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass,
                               resource['parent']['id'], out_csv)
    geoid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass,
                              resource['parent']['id'], out_geo)

    # Add metadata to original dataset indicating this was run
    self.log_info(resource, "updating file metadata")
    ext_meta = build_metadata(host, self.extractor_info, resource['id'],
                              {"files_created": [fileid, geoid]}, 'file')
    upload_metadata(connector, host, secret_key, resource['id'], ext_meta)

    # Trigger separate extractors
    self.log_info(resource, "triggering BETY extractor on %s" % fileid)
    submit_extraction(connector, host, secret_key, fileid, "terra.betydb")
    self.log_info(resource, "triggering geostreams extractor on %s" % geoid)
    submit_extraction(connector, host, secret_key, geoid, "terra.geostreams")

    self.end_message(resource)
def perform_process(transformer: transformer_class.Transformer, check_md: dict, transformer_md: dict, full_md: dict) -> dict:
    """Performs the processing of the data

    Arguments:
        transformer: instance of transformer class
        check_md: request-specific metadata (timestamp, working_folder, list_files)
        transformer_md: metadata associated with this transformer
        full_md: the full set of metadata for the request
    Return:
        Returns a dictionary with the results of processing: an error code and
        message on failure, or code 0 and the generated CSV file descriptors.
    """
    # Setup local variables
    timestamp = dateutil.parser.parse(check_md['timestamp'])
    datestamp = timestamp.strftime("%Y-%m-%d")
    localtime = timestamp.strftime("%Y-%m-%dT%H:%M:%S")

    geo_csv_filename = os.path.join(check_md['working_folder'], "canopycover_geostreams.csv")
    bety_csv_filename = os.path.join(check_md['working_folder'], "canopycover.csv")
    geo_file = open(geo_csv_filename, 'w')
    bety_file = open(bety_csv_filename, 'w')

    (fields, traits) = get_traits_table()
    # Setup default trait values from command-line overrides
    if transformer.args.germplasmName is not None:
        traits['species'] = transformer.args.germplasmName
    if transformer.args.citationAuthor is not None:
        traits['citation_author'] = transformer.args.citationAuthor
    if transformer.args.citationTitle is not None:
        traits['citation_title'] = transformer.args.citationTitle
    if transformer.args.citationYear is not None:
        traits['citation_year'] = transformer.args.citationYear
    else:
        traits['citation_year'] = timestamp.year

    geo_csv_header = ','.join(['site', 'trait', 'lat', 'lon', 'dp_time', 'source', 'value', 'timestamp'])
    bety_csv_header = ','.join(map(str, fields))
    if geo_file:
        geo_file.write(geo_csv_header + "\n")
    if bety_file:
        bety_file.write(bety_csv_header + "\n")

    all_plots = get_site_boundaries(datestamp, city='Maricopa')
    logging.debug("Found %s plots for date %s", str(len(all_plots)), str(datestamp))

    # Loop through finding all image files
    image_exts = SUPPORTED_IMAGE_EXTS
    num_files = 0
    total_plots_calculated = 0
    logging.debug("Looking for images with an extension of: %s", ",".join(image_exts))
    for one_file in check_md['list_files']():
        ext = os.path.splitext(one_file)[1]
        if not ext or ext not in image_exts:
            logging.debug("Skipping non-supported file '%s'", one_file)
            continue

        image_bounds = get_image_bounds(one_file)
        if not image_bounds:
            logging.info("Image file does not appear to be geo-referenced '%s'", one_file)
            continue

        overlap_plots = find_plots_intersect_boundingbox(image_bounds, all_plots, fullmac=True)
        # len() is never negative; the original "num_plots < 0" test was dead code.
        if not overlap_plots:
            logging.info("No plots intersect file '%s'", one_file)
            continue

        num_files += 1
        image_spatial_ref = get_spatial_reference_from_json(image_bounds)
        for plot_name in overlap_plots:
            plot_bounds = convert_json_geometry(overlap_plots[plot_name], image_spatial_ref)
            tuples = geojson_to_tuples_betydb(yaml.safe_load(plot_bounds))
            centroid = json.loads(centroid_from_geojson(plot_bounds))["coordinates"]
            try:
                logging.debug("Clipping raster to plot")
                pxarray = clip_raster(one_file, tuples, os.path.join(check_md['working_folder'], "temp.tif"))
                if pxarray is None:
                    continue
                if len(pxarray.shape) < 3:
                    logging.warning("Unexpected image dimensions for file '%s'", one_file)
                    logging.warning("    expected 3 and received %s", str(pxarray.shape))
                    # NOTE(review): break abandons the remaining plots for this
                    # file; kept as-is since every clip of the same image
                    # yields the same dimensionality.
                    break

                logging.debug("Calculating canopy cover")
                cc_val = calculate_canopycover_masked(np.rollaxis(pxarray, 0, 3))

                # Write the datapoint geographically and otherwise
                logging.debug("Writing to CSV files")
                if geo_file:
                    csv_data = ','.join([plot_name,
                                         'Canopy Cover',
                                         str(centroid[1]),
                                         str(centroid[0]),
                                         localtime,
                                         one_file,
                                         str(cc_val),
                                         datestamp])
                    geo_file.write(csv_data + "\n")
                if bety_file:
                    traits['canopy_cover'] = str(cc_val)
                    traits['site'] = plot_name
                    traits['local_datetime'] = localtime
                    trait_list = generate_traits_list(traits)
                    csv_data = ','.join(map(str, trait_list))
                    bety_file.write(csv_data + "\n")

                total_plots_calculated += 1
            except Exception as ex:
                logging.warning("Exception caught while processing canopy cover: %s", str(ex))
                logging.warning("Error generating canopy cover for '%s'", one_file)
                logging.warning("    plot name: '%s'", plot_name)
                continue

    # BUG FIX: close the CSV files before any return; the original leaked the
    # open handles on the two early-return error paths below.
    if geo_file:
        geo_file.close()
    if bety_file:
        bety_file.close()

    # Check that we got something
    if not num_files:
        return {'code': -1000, 'error': "No files were processed"}
    if not total_plots_calculated:
        return {'code': -1001, 'error': "No plots intersected with the images provided"}

    # Setup the metadata for returning files
    file_md = []
    if geo_file:
        file_md.append({'path': geo_csv_filename, 'key': 'csv'})
    if bety_file:
        file_md.append({'path': bety_csv_filename, 'key': 'csv'})

    return {'code': 0, 'files': file_md}
def process_message(self, connector, host, secret_key, resource, parameters):
    """Derive per-plot mean IR surface temperature from a full-field thermal
    GeoTIFF, then publish BETYdb and Geostreams CSVs back to Clowder and
    trigger the downstream extractors on them.
    """
    self.start_message(resource)

    # Get full list of experiment plots using date as filter
    ds_info = get_info(connector, host, secret_key, resource['parent']['id'])
    timestamp = ds_info['name'].split(" - ")[1]
    time_fmt = timestamp + "T12:00:00-07:00"
    out_csv = self.sensors.create_sensor_path(timestamp, sensor="ir_meantemp", opts=["bety"])
    out_geo = out_csv.replace("_bety", "_geo")

    # TODO: What should happen if CSV already exists? If we're here, there's no completed metadata...

    self.log_info(resource, "Writing BETY CSV to %s" % out_csv)
    csv_file = open(out_csv, 'w')
    (fields, traits) = get_traits_table()
    csv_file.write(','.join(map(str, fields)) + '\n')

    self.log_info(resource, "Writing Geostreams CSV to %s" % out_geo)
    geo_file = open(out_geo, 'w')
    geo_file.write(','.join(['site', 'trait', 'lat', 'lon', 'dp_time', 'source', 'value', 'timestamp']) + '\n')

    successful_plots = 0
    nan_plots = 0
    all_plots = get_site_boundaries(timestamp, city='Maricopa')
    for plotname in all_plots:
        # Kansas plots are handled by a separate pipeline.
        if plotname.find("KSU") > -1:
            self.log_info(resource, "skipping %s" % plotname)
            continue

        plot_geom = all_plots[plotname]
        clip_tuples = geojson_to_tuples_betydb(yaml.safe_load(plot_geom))
        centroid = json.loads(centroid_from_geojson(plot_geom))["coordinates"]

        # Use GeoJSON string to clip full field to this plot
        px = clip_raster(resource['local_paths'][0], clip_tuples)

        # Mask out negative readings, then average and convert Kelvin -> Celsius.
        px[px < 0] = numpy.nan
        mean_temp = numpy.nanmean(px) - 273.15

        if numpy.isnan(mean_temp):
            # Nothing usable inside this plot boundary.
            nan_plots += 1
        else:
            # Geostreams row first, then the BETYdb trait row for the same value.
            geo_row = [plotname,
                       'IR Surface Temperature',
                       str(centroid[1]),
                       str(centroid[0]),
                       time_fmt,
                       host + ("" if host.endswith("/") else "/") + "files/" + resource['id'],
                       str(mean_temp),
                       timestamp]
            geo_file.write(','.join(geo_row) + '\n')

            traits['surface_temperature'] = str(mean_temp)
            traits['site'] = plotname
            traits['local_datetime'] = timestamp + "T12:00:00"
            csv_file.write(','.join(map(str, generate_traits_list(traits))) + '\n')

        successful_plots += 1

    self.log_info(resource, "skipped %s of %s plots due to NaN" % (nan_plots, len(all_plots)))

    # submit CSV to BETY
    csv_file.close()
    geo_file.close()

    # Upload CSVs to Clowder
    fileid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass,
                               resource['parent']['id'], out_csv)
    geoid = upload_to_dataset(connector, host, self.clowder_user, self.clowder_pass,
                              resource['parent']['id'], out_geo)

    # Tell Clowder this is completed so subsequent file updates don't daisy-chain
    self.log_info(resource, "updating file metadata")
    metadata = build_metadata(host, self.extractor_info, resource['parent']['id'], {
        "total_plots": len(all_plots),
        "plots_processed": successful_plots,
        "blank_plots": nan_plots,
        "files_created": [fileid, geoid],
        "betydb_link": "https://terraref.ncsa.illinois.edu/bety/api/beta/variables?name=surface_temperature"
    }, 'dataset')
    upload_metadata(connector, host, secret_key, resource['parent']['id'], metadata)

    # Trigger downstream extractors
    self.log_info(resource, "triggering BETY extractor on %s" % fileid)
    submit_extraction(connector, host, secret_key, fileid, "terra.betydb")
    self.log_info(resource, "triggering geostreams extractor on %s" % geoid)
    submit_extraction(connector, host, secret_key, geoid, "terra.geostreams")

    self.end_message(resource)
def perform_process(transformer: transformer_class.Transformer, check_md: dict, transformer_md: list, full_md: list) -> dict:
    """Performs the processing of the data

    Arguments:
        transformer: instance of transformer class
        check_md: request-specific metadata (timestamp, working_folder, list_files)
        transformer_md: metadata associated with this transformer
        full_md: the full set of metadata for the request
    Return:
        Returns a dictionary with the results of processing: an error code and
        message on failure, or code 0 with the generated CSV file descriptors
        and per-run statistics under the transformer's name.
    """
    # pylint: disable=unused-argument
    # Disabling pylint checks because resolving them would make code unreadable
    # pylint: disable=too-many-branches, too-many-statements, too-many-locals

    # Setup local variables
    start_timestamp = datetime.datetime.now()
    timestamp = dateutil.parser.parse(check_md['timestamp'])
    datestamp = timestamp.strftime("%Y-%m-%d")
    localtime = timestamp.strftime("%Y-%m-%dT%H:%M:%S")

    geo_csv_filename = os.path.join(check_md['working_folder'], "meantemp_geostreams.csv")
    bety_csv_filename = os.path.join(check_md['working_folder'], "meantemp.csv")
    geo_file = open(geo_csv_filename, 'w')
    bety_file = open(bety_csv_filename, 'w')

    (fields, traits) = get_traits_table()
    # Setup default trait values from command-line overrides
    if transformer.args.citationAuthor is not None:
        traits['citation_author'] = transformer.args.citationAuthor
    if transformer.args.citationTitle is not None:
        traits['citation_title'] = transformer.args.citationTitle
    if transformer.args.citationYear is not None:
        traits['citation_year'] = transformer.args.citationYear
    else:
        traits['citation_year'] = timestamp.year

    geo_csv_header = ','.join(['site', 'trait', 'lat', 'lon', 'dp_time', 'source', 'value', 'timestamp'])
    bety_csv_header = ','.join(map(str, fields))
    if geo_file:
        geo_file.write(geo_csv_header + "\n")
    if bety_file:
        bety_file.write(bety_csv_header + "\n")

    all_plots = get_site_boundaries(datestamp, city='Maricopa')
    logging.debug("Found %s plots for date %s", str(len(all_plots)), str(datestamp))

    # Loop through finding all image files
    image_exts = SUPPORTED_IMAGE_EXTS
    num_files = 0
    number_empty_plots = 0
    total_plots_calculated = 0
    total_files = 0
    processed_plots = 0
    logging.debug("Looking for images with an extension of: %s", ",".join(image_exts))
    for one_file in check_md['list_files']():
        total_files += 1
        ext = os.path.splitext(one_file)[1]
        if not ext or ext not in image_exts:
            logging.debug("Skipping non-supported file '%s'", one_file)
            continue

        image_bounds = get_image_bounds(one_file)
        if not image_bounds:
            logging.info("Image file does not appear to be geo-referenced '%s'", one_file)
            continue

        overlap_plots = find_plots_intersect_boundingbox(image_bounds, all_plots, fullmac=True)
        # len() is never negative; the original "num_plots < 0" test was dead code.
        if not overlap_plots:
            logging.info("No plots intersect file '%s'", one_file)
            continue

        num_files += 1
        image_spatial_ref = get_spatial_reference_from_json(image_bounds)
        for plot_name in overlap_plots:
            processed_plots += 1
            plot_bounds = convert_json_geometry(overlap_plots[plot_name], image_spatial_ref)
            tuples = geojson_to_tuples_betydb(yaml.safe_load(plot_bounds))
            centroid = json.loads(centroid_from_geojson(plot_bounds))["coordinates"]
            try:
                logging.debug("Clipping raster to plot")
                clip_path = os.path.join(check_md['working_folder'], "temp.tif")
                pxarray = clip_raster(one_file, tuples, clip_path)
                # The clip is only needed in memory; drop the scratch file.
                if os.path.exists(clip_path):
                    os.remove(clip_path)
                if pxarray is None:
                    continue

                logging.debug("Calculating mean temperature")
                # Mask negative readings, average, and convert Kelvin -> Celsius.
                pxarray[pxarray < 0] = np.nan
                mean_tc = np.nanmean(pxarray) - 273.15

                # Check for empty plots
                if np.isnan(mean_tc):
                    number_empty_plots += 1
                    continue

                # Write the data point geographically and otherwise
                logging.debug("Writing to CSV files")
                if geo_file:
                    csv_data = ','.join([plot_name,
                                         'IR Surface Temperature',
                                         str(centroid[1]),
                                         str(centroid[0]),
                                         localtime,
                                         one_file,
                                         str(mean_tc),
                                         datestamp])
                    geo_file.write(csv_data + "\n")
                if bety_file:
                    traits['surface_temperature'] = str(mean_tc)
                    traits['site'] = plot_name
                    traits['local_datetime'] = localtime
                    trait_list = generate_traits_list(traits)
                    csv_data = ','.join(map(str, trait_list))
                    bety_file.write(csv_data + "\n")

                total_plots_calculated += 1
            except Exception as ex:
                logging.warning("Exception caught while processing mean temperature: %s", str(ex))
                logging.warning("Error generating mean temperature for '%s'", one_file)
                logging.warning("    plot name: '%s'", plot_name)
                continue

    # BUG FIX: close the CSV files before any return; the original leaked the
    # open handles on the two early-return error paths below.
    if geo_file:
        geo_file.close()
    if bety_file:
        bety_file.close()

    # Check that we got something
    if not num_files:
        return {'code': -1000, 'error': "No files were processed"}
    if not total_plots_calculated:
        return {'code': -1001, 'error': "No plots intersected with the images provided"}

    # Setup the metadata for returning files
    file_md = []
    if geo_file:
        file_md.append({'path': geo_csv_filename, 'key': 'csv'})
    if bety_file:
        file_md.append({'path': bety_csv_filename, 'key': 'csv'})

    return {
        'code': 0,
        'files': file_md,
        configuration.TRANSFORMER_NAME: {
            'version': configuration.TRANSFORMER_VERSION,
            'utc_timestamp': datetime.datetime.utcnow().isoformat(),
            'processing_time': str(datetime.datetime.now() - start_timestamp),
            'total_file_count': total_files,
            'processed_file_count': num_files,
            'total_plots_processed': processed_plots,
            'empty_plots': number_empty_plots
        }
    }
geo_csv = args.input.replace(".tif", "_canopycover_geo.csv") csv_file = open(bety_csv, 'w') (fields, traits) = get_traits_table() csv_file.write(','.join(map(str, fields)) + '\n') geo_file = open(geo_csv, 'w') geo_file.write(','.join([ 'site', 'trait', 'lat', 'lon', 'dp_time', 'source', 'value', 'timestamp' ]) + '\n') all_plots = get_site_boundaries(date, city='Maricopa') successful_plots = 0 for plotname in all_plots: logger.debug("Processing plot %s" % plotname) bounds = all_plots[plotname] tuples = geojson_to_tuples_betydb(yaml.safe_load(bounds)) centroid_lonlat = json.loads(centroid_from_geojson(bounds))["coordinates"] # Use GeoJSON string to clip full field to this plot try: pxarray = clip_raster(args.input, tuples) if pxarray is not None: pxarray = rollaxis(pxarray, 0, 3) if len(pxarray.shape) < 3: logger.error("unexpected array shape for %s (%s)" % (plotname, pxarray.shape)) continue ccVal = terraref.stereo_rgb.calculate_canopycover(pxarray) successful_plots += 1