def submit_extractions_by_dataset(connector, host, key, datasetid, extractorname, ext=False):
    """Manually trigger an extraction on all files in a dataset.

    This will iterate through all files in the given dataset and submit them to
    the provided extractor.

    Keyword arguments:
    connector -- connector information, used to get missing parameters and send status updates
    host -- the clowder host, including http and port, should end with a /
    key -- the secret key to login to clowder
    datasetid -- the dataset UUID to submit
    extractorname -- registered name of extractor to trigger
    ext -- extension to filter. e.g. 'tif' will only submit TIFF files for extraction.
    """

    filelist = get_file_list(connector, host, key, datasetid)

    for f in filelist:
        # Only submit files that end with given extension, if specified
        if ext and not f['filename'].endswith(ext):
            continue

        submit_extraction(connector, host, key, f['id'], extractorname)
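
# Usage sketch (illustrative, not part of the source): resubmit every TIFF file in a dataset to
# an extractor. The connector is assumed to come from a running pyclowder extractor; the host,
# key, dataset UUID, and extractor name are placeholder values.
def example_resubmit_tifs(connector):
    host = "https://clowder.example.org/"       # placeholder Clowder URL, must end with /
    key = "SECRET_KEY"                          # placeholder API key
    dataset_id = "5d1a2b3c4e5f6a7b8c9d0e1f"     # placeholder dataset UUID
    # Only files whose names end in 'tif' are submitted because of the ext filter
    submit_extractions_by_dataset(connector, host, key, dataset_id,
                                  "ncsa.image.preview", ext="tif")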
def check_file_in_dataset(connector, host, secret_key, dsid, filepath, remove=False, forcepath=False,
                          replacements=[]):
    """Check whether a file is already present in the given Clowder dataset.

    replacements -- e.g. [("L2","L1")]; each tuple is checked by replacing the first element in
    filepath with the second element before looking for an existing file.
    """
    dest_files = get_file_list(connector, host, secret_key, dsid)

    if len(replacements) > 0:
        for r in replacements:
            filepath = filepath.replace(r[0], r[1])
    for source_path in connector.mounted_paths:
        if filepath.startswith(connector.mounted_paths[source_path]):
            filepath = filepath.replace(connector.mounted_paths[source_path], source_path)
    filename = os.path.basename(filepath)

    found_file = False
    for f in dest_files:
        if (not forcepath and f['filename'] == filename) or (forcepath and f['filepath'] == filepath):
            if remove:
                delete_file(host, secret_key, f['id'])
            found_file = True

    return found_file
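
# Usage sketch (assumption, not from the source): only upload a file when it is not already in
# the target dataset. The dataset UUID and path are placeholders.
def example_upload_if_missing(connector, host, secret_key):
    dsid = "5d1a2b3c4e5f6a7b8c9d0e1f"                     # placeholder dataset UUID
    filepath = "/home/extractor/sites/example/file.tif"   # placeholder file path
    if not check_file_in_dataset(connector, host, secret_key, dsid, filepath):
        upload_to_dataset(connector, host, secret_key, dsid, filepath)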
def upload_to_dataset(connector, host, key, datasetid, filepath, check_duplicate=False):
    """Upload file to existing Clowder dataset.

    Keyword arguments:
    connector -- connector information, used to get missing parameters and send status updates
    host -- the clowder host, including http and port, should end with a /
    key -- the secret key to login to clowder
    datasetid -- the dataset that the file should be associated with
    filepath -- path to file
    check_duplicate -- check if filename already exists in dataset and skip upload if so
    """

    logger = logging.getLogger(__name__)

    if check_duplicate:
        ds_files = get_file_list(connector, host, key, datasetid)
        for f in ds_files:
            if f['filename'] == os.path.basename(filepath):
                logger.debug("found %s in dataset %s; not re-uploading" % (f['filename'], datasetid))
                return None

    for source_path in connector.mounted_paths:
        if filepath.startswith(connector.mounted_paths[source_path]):
            return _upload_to_dataset_local(connector, host, key, datasetid, filepath)

    url = '%sapi/uploadToDataset/%s?key=%s' % (host, datasetid, key)

    if os.path.exists(filepath):
        result = connector.post(url, files={"File": open(filepath, 'rb')},
                                verify=connector.ssl_verify if connector else True)

        uploadedfileid = result.json()['id']
        logger.debug("uploaded file id = [%s]", uploadedfileid)

        return uploadedfileid
    else:
        logger.error("unable to upload file %s (not found)", filepath)
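
# Usage sketch (assumption): upload with the built-in duplicate check and handle the None return
# that signals the upload was skipped. The dataset UUID and path are placeholders.
def example_upload_with_duplicate_check(connector, host, key):
    dsid = "5d1a2b3c4e5f6a7b8c9d0e1f"   # placeholder dataset UUID
    fileid = upload_to_dataset(connector, host, key, dsid,
                               "/home/extractor/output.nc", check_duplicate=True)
    if fileid is None:
        logging.getLogger(__name__).info("file already present in dataset; nothing uploaded")
    return fileid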
def submit_missing_regex(sensor_name, target, date):
    """Submit parent files for extraction when their expected regex-matched outputs are missing."""
    sensordef = count_defs[sensor_name]
    targetdef = sensordef[target]
    extractorname = targetdef["extractor"]
    submitted = []
    notfound = []

    if "parent" in targetdef:
        # Check the filesystem for parent files whose expected outputs are missing
        parentdef = sensordef[targetdef["parent"]]
        parent_dir = os.path.join(parentdef["path"], date)

        if parentdef["type"] == "regex" and parentdef["path"] == targetdef["path"]:
            for file in os.listdir(parent_dir):
                if re.match(parentdef["regex"], file):
                    expected_output = file.replace(targetdef["parent_replacer_check"][1],
                                                   targetdef["parent_replacer_check"][0])
                    if not os.path.isfile(os.path.join(parent_dir, expected_output)):
                        # Find the file ID of the parent file and submit it
                        dataset_name = parentdef["dispname"] + " - " + date
                        dsid = get_dsid_by_name(dataset_name)
                        if dsid:
                            parent_id = None
                            dsfiles = get_file_list(CONN, CLOWDER_HOST, CLOWDER_KEY, dsid)
                            matchfile = file.replace("_thumb.tif", ".tif")
                            for dsfile in dsfiles:
                                if dsfile["filename"] == matchfile:
                                    parent_id = dsfile["id"]
                                    break
                            if parent_id:
                                submit_file_extraction(CONN, CLOWDER_HOST, CLOWDER_KEY, parent_id, extractorname)
                                submitted.append({"name": matchfile, "id": parent_id})
                            else:
                                notfound.append({"name": matchfile})
                        else:
                            notfound.append({"name": dataset_name})

    return json.dumps({
        "extractor": extractorname,
        "datasets submitted": submitted,
        "datasets not found": notfound
    })
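
# Illustrative sketch (assumption, not from the source): the rough shape of a count_defs entry
# that submit_missing_regex expects, inferred from the keys it accesses above. Sensor names,
# paths, regexes, suffixes, and the extractor name are hypothetical placeholders.
EXAMPLE_COUNT_DEFS = {
    "examplesensor": {
        "source": {                                   # the "parent" definition
            "type": "regex",
            "path": "/sites/example/Level_1/examplesensor",
            "regex": r".*_thumb\.tif$",
            "dispname": "Example Sensor",
        },
        "derived": {                                  # the target whose outputs may be missing
            "path": "/sites/example/Level_1/examplesensor",
            "parent": "source",
            # expected output name = parent filename with element [1] replaced by element [0]
            "parent_replacer_check": ("_mask.tif", "_thumb.tif"),
            "extractor": "example.mask.extractor",
        },
    }
}
# e.g. submit_missing_regex("examplesensor", "derived", "2017-05-15") would scan the date folder
# for *_thumb.tif files and, for each one whose *_mask.tif output is missing, submit the matching
# full .tif from the Clowder dataset to the extractor.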
def process_message(self, connector, host, secret_key, resource, parameters):
    self.start_message(resource)

    # Build list of JSON files
    json_files = []
    for f in resource['files']:
        if f['filename'].endswith("_environmentlogger.json"):
            if f['filepath'].startswith("/home/clowder"):
                json_files.append(f['filepath'].replace("/home/clowder", "/home/extractor"))
            else:
                json_files.append(f['filepath'])
    json_files.sort()

    # Determine full output path
    timestamp = resource['name'].split(" - ")[1]
    out_fullday_netcdf = self.sensors.create_sensor_path(timestamp)
    temp_out_full = os.path.join(os.path.dirname(out_fullday_netcdf), "temp_full.nc")
    temp_out_single = temp_out_full.replace("_full.nc", "_single.nc")
    geo_csv = out_fullday_netcdf.replace(".nc", "_geo.csv")

    if not file_exists(temp_out_full):
        for json_file in json_files:
            self.log_info(resource, "converting %s to netCDF & appending" % os.path.basename(json_file))
            ela.mainProgramTrigger(json_file, temp_out_single)
            cmd = "ncrcat --record_append %s %s" % (temp_out_single, temp_out_full)
            subprocess.call([cmd], shell=True)
            os.remove(temp_out_single)

        shutil.move(temp_out_full, out_fullday_netcdf)
        self.created += 1
        self.bytes += os.path.getsize(out_fullday_netcdf)

    # Write out geostreams.csv
    if not file_exists(geo_csv):
        self.log_info(resource, "writing geostreams CSV")
        geo_file = open(geo_csv, 'w')
        geo_file.write(','.join(['site', 'trait', 'lat', 'lon', 'dp_time',
                                 'source', 'value', 'timestamp']) + '\n')

        with Dataset(out_fullday_netcdf, "r") as ncdf:
            streams = set([sensor_info.name for sensor_info in ncdf.variables.values()
                           if sensor_info.name.startswith('sensor')])
            for stream in streams:
                if stream != "sensor_spectrum":
                    try:
                        memberlist = ncdf.get_variables_by_attributes(sensor=stream)
                        for members in memberlist:
                            data_points = _produce_attr_dict(members)
                            for index in range(len(data_points)):
                                dp_obj = data_points[index]
                                if dp_obj["sensor"] == stream:
                                    time_format = "%Y-%m-%dT%H:%M:%S-07:00"
                                    time_point = (datetime.datetime(year=1970, month=1, day=1) +
                                                  datetime.timedelta(days=ncdf.variables["time"][index])
                                                  ).strftime(time_format)

                                    geo_file.write(','.join([
                                        "Full Field - Environmental Logger",
                                        "(EL) %s" % stream,
                                        str(33.075576),
                                        str(-111.974304),
                                        time_point,
                                        host + ("" if host.endswith("/") else "/") + "datasets/" + resource['id'],
                                        '"%s"' % json.dumps(dp_obj).replace('"', '""'),
                                        timestamp
                                    ]) + '\n')
                    except:
                        self.log_error(resource, "NetCDF attribute not found: %s" % stream)

        # Close the CSV so all rows are flushed before it is uploaded below
        geo_file.close()

    # Fetch dataset ID by dataset name if not provided
    target_dsid = build_dataset_hierarchy_crawl(host, secret_key, self.clowder_user, self.clowder_pass,
                                                self.clowderspace, None, None,
                                                self.sensors.get_display_name(),
                                                timestamp[:4], timestamp[5:7], timestamp[8:10],
                                                leaf_ds_name=self.sensors.get_display_name() + ' - ' + timestamp)
    ds_files = get_file_list(connector, host, secret_key, target_dsid)
    found_full = False
    found_csv = False
    for f in ds_files:
        if f['filename'] == os.path.basename(out_fullday_netcdf):
            found_full = True
        if f['filename'] == os.path.basename(geo_csv):
            found_csv = True
    if not found_full:
        upload_to_dataset(connector, host, secret_key, target_dsid, out_fullday_netcdf)
    if not found_csv:
        geoid = upload_to_dataset(connector, host, secret_key, target_dsid, geo_csv)
        self.log_info(resource, "triggering geostreams extractor on %s" % geoid)
        submit_extraction(connector, host, secret_key, geoid, "terra.geostreams")

    # Tell Clowder this is completed so subsequent file updates don't daisy-chain
    ext_meta = build_metadata(host, self.extractor_info, resource['id'],
                              {"output_dataset": target_dsid}, 'dataset')
    upload_metadata(connector, host, secret_key, resource['id'], ext_meta)

    self.end_message(resource)
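
# Usage sketch (assumption): process_message above is a method of a pyclowder extractor class;
# the class name used here is hypothetical. A typical entry point instantiates the extractor and
# starts listening for dataset messages:
#
# if __name__ == "__main__":
#     extractor = EnvironmentLoggerFullDay()  # hypothetical subclass defining process_message
#     extractor.start()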