def ingest(objectid, dsets_file, grq_update_url, dataset_processed_queue, prod_path, job_path, dry_run=False, force=False): """Run dataset ingest.""" logger.info("#" * 80) logger.info("datasets: %s" % dsets_file) logger.info("grq_update_url: %s" % grq_update_url) logger.info("dataset_processed_queue: %s" % dataset_processed_queue) logger.info("prod_path: %s" % prod_path) logger.info("job_path: %s" % job_path) logger.info("dry_run: %s" % dry_run) logger.info("force: %s" % force) # get default job path if job_path is None: job_path = os.getcwd() # detect job info job = {} job_json = os.path.join(job_path, '_job.json') if os.path.exists(job_json): with open(job_json) as f: try: job = json.load(f) except Exception, e: logger.warn("Failed to read job json:\n{}".format(str(e)))
def offline_jobs(event): """Set job status to job-offline.""" time_end = datetime.utcnow().isoformat() + "Z" query = { "query": { "bool": { "must": [ {"term": {"celery_hostname": event["hostname"]}}, {"term": {"status": "job-started"}}, ] } } } logger.info("offline jobs query: %s" % json.dumps(query)) uuids = [] try: job_status_jsons = mozart_es.query(index="job_status-current", body=query) logger.info("Got {} jobs for {}.".format(len(job_status_jsons), event["hostname"])) for job_status in job_status_jsons: job_status_json = job_status['_source'] uuid = job_status_json["uuid"] # offline the job only if it hasn't been picked up by another worker cur_job_status = get_val_via_socket(JOB_STATUS_KEY_TMPL % uuid) cur_job_worker = get_val_via_socket(TASK_WORKER_KEY_TMPL % uuid) logger.info("cur_job_status: {}".format(cur_job_status)) logger.info("cur_job_worker: {}".format(cur_job_worker)) if cur_job_status == "job-started" and cur_job_worker == event["hostname"]: job_status_json["status"] = "job-offline" job_status_json["error"] = "Received worker-offline event during job execution." job_status_json["short_error"] = "worker-offline" job_status_json.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end log_job_status(job_status_json) logger.info("Offlined job with UUID %s" % uuid) uuids.append(uuid) else: logger.info("Not offlining job with UUID %s since real-time job status doesn't match" % uuid) except Exception as e: logger.warn("Got exception trying to update task events for offline worker %s: %s\n%s" % ( event["hostname"], str(e), traceback.format_exc()))
def check_dataset(id, es_index="grq"): """Query for dataset with specified input ID.""" query = { "query": { "bool": { "must": [ { "term": { "_id": id } }, ] } }, "fields": [], } es_url = app.conf['GRQ_ES_URL'] if es_url.endswith('/'): search_url = '%s%s/_search' % (es_url, es_index) else: search_url = '%s/%s/_search' % (es_url, es_index) r = requests.post(search_url, data=json.dumps(query)) if r.status_code == 200: result = r.json() total = result['hits']['total'] else: logger.warn("Failed to query %s:\n%s" % (es_url, r.text)) logger.warn("query: %s" % json.dumps(query, indent=2)) logger.warn("returned: %s" % r.text) if r.status_code == 404: total = 0 else: r.raise_for_status() return total
if dry_run: logger.info("Would've published %s to %s" % (local_prod_path, pub_path_url)) else: publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name) orig_publ_ctx_file = publ_ctx_file + '.orig' try: publish_dataset(local_prod_path, pub_path_url, params=osaka_params, force=force, publ_ctx_file=publ_ctx_file, publ_ctx_url=publ_ctx_url) except NoClobberPublishContextException, e: logger.warn( "A publish context file was found at {}. Retrieving.". format(publ_ctx_url)) osaka.main.get(publ_ctx_url, orig_publ_ctx_file, params=osaka_params) with open(orig_publ_ctx_file) as f: orig_publ_ctx = json.load(f) logger.warn("original publish context: {}".format( json.dumps(orig_publ_ctx, indent=2, sort_keys=True))) orig_payload_id = orig_publ_ctx.get('payload_id', None) orig_payload_hash = orig_publ_ctx.get('payload_hash', None) orig_task_id = orig_publ_ctx.get('task_id', None) logger.warn("orig payload_id: {}".format(orig_payload_id)) logger.warn("orig payload_hash: {}".format(orig_payload_hash)) logger.warn("orig task_id: {}".format(orig_payload_id))
def ensure_image_loaded(image_name, image_url, cache_dir): """Pull docker image into local repo.""" # check if image is in local docker repo try: registry = app.conf.get("CONTAINER_REGISTRY", None) # Custom edit to load image from registry try: if registry is not None: logger.info( "Trying to load docker image {} from registry '{}'".format( image_name, registry)) registry_url = os.path.join(registry, image_name) logger.info("docker pull {}".format(registry_url)) check_output(['docker', 'pull', registry_url]) logger.info("docker tag {} {}".format(registry_url, image_name)) check_output(['docker', 'tag', registry_url, image_name]) except Exception as e: logger.warn( "Unable to load docker image from registry '{}': {}".format( registry, e)) image_info = check_output(['docker', 'inspect', image_name]) logger.info("Docker image %s cached in repo" % image_name) except: logger.info("Failed to inspect docker image %s" % image_name) # pull image from url if image_url is not None: image_file = os.path.join(cache_dir, os.path.basename(image_url)) if not os.path.exists(image_file): logger.info("Downloading image %s (%s) from %s" % (image_file, image_name, image_url)) try: osaka.main.get(image_url, image_file) except Exception as e: raise RuntimeError("Failed to download image {}:\n{}".format(image_url, str(e))) logger.info("Downloaded image %s (%s) from %s" % (image_file, image_name, image_url)) load_lock = "{}.load.lock".format(image_file) try: with atomic_write(load_lock) as f: f.write("%sZ\n" % datetime.utcnow().isoformat()) logger.info("Loading image %s (%s)" % (image_file, image_name)) p = Popen(['docker', 'load', '-i', image_file], stderr=PIPE, stdout=PIPE) stdout, stderr = p.communicate() if p.returncode != 0: raise RuntimeError("Failed to load image {} ({}): {}".format(image_file, image_name, stderr.decode())) logger.info("Loaded image %s (%s)" % (image_file, image_name)) try: os.unlink(image_file) except: pass try: os.unlink(load_lock) except: pass except OSError as e: if e.errno == 17: logger.info("Waiting for image %s (%s) to load" % (image_file, image_name)) inspect_image(image_name) else: raise else: # pull image from docker hub logger.info("Pulling image %s from docker hub" % image_name) check_output(['docker', 'pull', image_name]) logger.info("Pulled image %s from docker hub" % image_name) image_info = check_output(['docker', 'inspect', image_name]) logger.info("image info for %s: %s" % (image_name, image_info.decode())) return json.loads(image_info)[0]
def ingest( objectid, dsets_file, grq_update_url, dataset_processed_queue, prod_path, job_path, dry_run=False, force=False, ): """Run dataset ingest.""" logger.info("#" * 80) logger.info("datasets: %s" % dsets_file) logger.info("grq_update_url: %s" % grq_update_url) logger.info("dataset_processed_queue: %s" % dataset_processed_queue) logger.info("prod_path: %s" % prod_path) logger.info("job_path: %s" % job_path) logger.info("dry_run: %s" % dry_run) logger.info("force: %s" % force) # get default job path if job_path is None: job_path = os.getcwd() # detect job info job = {} job_json = os.path.join(job_path, "_job.json") if os.path.exists(job_json): with open(job_json) as f: try: job = json.load(f) except Exception as e: logger.warn("Failed to read job json:\n{}".format(str(e))) task_id = job.get("task_id", None) payload_id = (job.get("job_info", {}).get("job_payload", {}).get("payload_task_id", None)) payload_hash = job.get("job_info", {}).get("payload_hash", None) logger.info("task_id: %s" % task_id) logger.info("payload_id: %s" % payload_id) logger.info("payload_hash: %s" % payload_hash) # get dataset if os.path.isdir(prod_path): local_prod_path = prod_path else: local_prod_path = get_remote_dav(prod_path) if not os.path.isdir(local_prod_path): raise RuntimeError("Failed to find local dataset directory: %s" % local_prod_path) # write publish context publ_ctx_name = "_publish.context.json" publ_ctx_dir = mkdtemp(prefix=".pub_context", dir=job_path) publ_ctx_file = os.path.join(publ_ctx_dir, publ_ctx_name) with open(publ_ctx_file, "w") as f: json.dump( { "payload_id": payload_id, "payload_hash": payload_hash, "task_id": task_id, }, f, indent=2, sort_keys=True, ) publ_ctx_url = None # dataset name pname = os.path.basename(local_prod_path) # dataset file dataset_file = os.path.join(local_prod_path, "%s.dataset.json" % pname) # get dataset json with open(dataset_file) as f: dataset = json.load(f) logger.info("Loaded dataset JSON from file: %s" % dataset_file) # check minimum requirements for dataset JSON logger.info("Verifying dataset JSON...") verify_dataset(dataset) logger.info("Dataset JSON verfication succeeded.") # get version version = dataset["version"] # recognize r = Recognizer(dsets_file, local_prod_path, objectid, version) # get extractor extractor = r.getMetadataExtractor() if extractor is not None: match = SCRIPT_RE.search(extractor) if match: extractor = match.group(1) logger.info("Configured metadata extractor: %s" % extractor) # metadata file metadata_file = os.path.join(local_prod_path, "%s.met.json" % pname) # metadata seed file seed_file = os.path.join(local_prod_path, "met.json") # metadata file already here if os.path.exists(metadata_file): with open(metadata_file) as f: metadata = json.load(f) logger.info("Loaded metadata from existing file: %s" % metadata_file) else: if extractor is None: logger.info( "No metadata extraction configured. Setting empty metadata.") metadata = {} else: logger.info("Running metadata extractor %s on %s" % (extractor, local_prod_path)) m = check_output([extractor, local_prod_path]) logger.info("Output: %s" % m.decode()) # generate json to update metadata and urls metadata = json.loads(m) # set data_product_name metadata["data_product_name"] = objectid # merge with seed metadata if os.path.exists(seed_file): with open(seed_file) as f: seed = json.load(f) metadata.update(seed) logger.info("Loaded seed metadata from file: %s" % seed_file) # write it out to file with open(metadata_file, "w") as f: json.dump(metadata, f, indent=2) logger.info("Wrote metadata to %s" % metadata_file) # delete seed file if os.path.exists(seed_file): os.unlink(seed_file) logger.info("Deleted seed file %s." % seed_file) # read context context_file = os.path.join(local_prod_path, "%s.context.json" % pname) if os.path.exists(context_file): with open(context_file) as f: context = json.load(f) logger.info("Loaded context from existing file: %s" % context_file) else: context = {} # set metadata and dataset groups in recognizer r.setDataset(dataset) r.setMetadata(metadata) # get ipath ipath = r.getIpath() # get level level = r.getLevel() # get type dtype = r.getType() # set product metrics prod_metrics = {"ipath": ipath, "path": local_prod_path} # publish dataset if r.publishConfigured(): logger.info("Dataset publish is configured.") # get publish path pub_path_url = r.getPublishPath() # get publish urls pub_urls = [i for i in r.getPublishUrls()] # get S3 profile name and api keys for dataset publishing s3_secret_key, s3_access_key = r.getS3Keys() s3_profile = r.getS3Profile() # set osaka params osaka_params = {} # S3 profile takes precedence over explicit api keys if s3_profile is not None: osaka_params["profile_name"] = s3_profile else: if s3_secret_key is not None and s3_access_key is not None: osaka_params["aws_access_key_id"] = s3_access_key osaka_params["aws_secret_access_key"] = s3_secret_key # get pub host and path logger.info("Configured pub host & path: %s" % (pub_path_url)) # check scheme if not osaka.main.supported(pub_path_url): raise RuntimeError("Scheme %s is currently not supported." % urlparse(pub_path_url).scheme) # upload dataset to repo; track disk usage and start/end times of transfer prod_dir_usage = get_disk_usage(local_prod_path) tx_t1 = datetime.utcnow() if dry_run: logger.info("Would've published %s to %s" % (local_prod_path, pub_path_url)) else: publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name) orig_publ_ctx_file = publ_ctx_file + ".orig" try: publish_dataset( local_prod_path, pub_path_url, params=osaka_params, force=force, publ_ctx_file=publ_ctx_file, publ_ctx_url=publ_ctx_url, ) except NoClobberPublishContextException as e: logger.warn( "A publish context file was found at {}. Retrieving.". format(publ_ctx_url)) osaka.main.get(publ_ctx_url, orig_publ_ctx_file, params=osaka_params) with open(orig_publ_ctx_file) as f: orig_publ_ctx = json.load(f) logger.warn("original publish context: {}".format( json.dumps(orig_publ_ctx, indent=2, sort_keys=True))) orig_payload_id = orig_publ_ctx.get("payload_id", None) orig_payload_hash = orig_publ_ctx.get("payload_hash", None) orig_task_id = orig_publ_ctx.get("task_id", None) logger.warn("orig payload_id: {}".format(orig_payload_id)) logger.warn("orig payload_hash: {}".format(orig_payload_hash)) logger.warn("orig task_id: {}".format(orig_payload_id)) if orig_payload_id is None: raise # overwrite if this job is a retry of the previous job if payload_id is not None and payload_id == orig_payload_id: msg = ( "This job is a retry of a previous job that resulted " + "in an orphaned dataset. Forcing publish.") logger.warn(msg) log_custom_event( "orphaned_dataset-retry_previous_failed", "clobber", { "orphan_info": { "payload_id": payload_id, "payload_hash": payload_hash, "task_id": task_id, "orig_payload_id": orig_payload_id, "orig_payload_hash": orig_payload_hash, "orig_task_id": orig_task_id, "dataset_id": objectid, "dataset_url": pub_path_url, "msg": msg, } }, ) else: job_status = get_job_status(orig_payload_id) logger.warn("orig job status: {}".format(job_status)) # overwrite if previous job failed if job_status == "job-failed": msg = ( "Detected previous job failure that resulted in an " + "orphaned dataset. Forcing publish.") logger.warn(msg) log_custom_event( "orphaned_dataset-job_failed", "clobber", { "orphan_info": { "payload_id": payload_id, "payload_hash": payload_hash, "task_id": task_id, "orig_payload_id": orig_payload_id, "orig_payload_hash": orig_payload_hash, "orig_task_id": orig_task_id, "orig_status": job_status, "dataset_id": objectid, "dataset_url": pub_path_url, "msg": msg, } }, ) else: # overwrite if dataset doesn't exist in grq if not dataset_exists(objectid): msg = "Detected orphaned dataset without ES doc. Forcing publish." logger.warn(msg) log_custom_event( "orphaned_dataset-no_es_doc", "clobber", { "orphan_info": { "payload_id": payload_id, "payload_hash": payload_hash, "task_id": task_id, "dataset_id": objectid, "dataset_url": pub_path_url, "msg": msg, } }, ) else: raise publish_dataset( local_prod_path, pub_path_url, params=osaka_params, force=True, publ_ctx_file=publ_ctx_file, publ_ctx_url=publ_ctx_url, ) except osaka.utils.NoClobberException as e: if dataset_exists(objectid): try: osaka.main.rmall(publ_ctx_url, params=osaka_params) except: logger.warn( "Failed to clean up publish context {} after attempting to clobber valid dataset." .format(publ_ctx_url)) raise else: msg = "Detected orphaned dataset without ES doc. Forcing publish." logger.warn(msg) log_custom_event( "orphaned_dataset-no_es_doc", "clobber", { "orphan_info": { "payload_id": payload_id, "payload_hash": payload_hash, "task_id": task_id, "dataset_id": objectid, "dataset_url": pub_path_url, "msg": msg, } }, ) publish_dataset( local_prod_path, pub_path_url, params=osaka_params, force=True, publ_ctx_file=publ_ctx_file, publ_ctx_url=publ_ctx_url, ) tx_t2 = datetime.utcnow() tx_dur = (tx_t2 - tx_t1).total_seconds() # save dataset metrics on size and transfer prod_metrics.update({ "url": urlparse(pub_path_url).path, "disk_usage": prod_dir_usage, "time_start": tx_t1.isoformat() + "Z", "time_end": tx_t2.isoformat() + "Z", "duration": tx_dur, "transfer_rate": prod_dir_usage / tx_dur, }) else: logger.info("Dataset publish is not configured.") pub_urls = [] # publish browse if r.browseConfigured(): logger.info("Browse publish is configured.") # get browse path and urls browse_path = r.getBrowsePath() browse_urls = r.getBrowseUrls() # get S3 profile name and api keys for browse image publishing s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse") s3_profile_browse = r.getS3Profile("browse") # set osaka params for browse osaka_params_browse = {} # S3 profile takes precedence over explicit api keys if s3_profile_browse is not None: osaka_params_browse["profile_name"] = s3_profile_browse else: if s3_secret_key_browse is not None and s3_access_key_browse is not None: osaka_params_browse["aws_access_key_id"] = s3_access_key_browse osaka_params_browse[ "aws_secret_access_key"] = s3_secret_key_browse # add metadata for all browse images and upload to browse location imgs_metadata = [] imgs = glob("%s/*browse.png" % local_prod_path) for img in imgs: img_metadata = {"img": os.path.basename(img)} small_img = img.replace("browse.png", "browse_small.png") if os.path.exists(small_img): small_img_basename = os.path.basename(small_img) if browse_path is not None: this_browse_path = os.path.join(browse_path, small_img_basename) if dry_run: logger.info("Would've uploaded %s to %s" % (small_img, browse_path)) else: logger.info("Uploading %s to %s" % (small_img, browse_path)) osaka.main.put( small_img, this_browse_path, params=osaka_params_browse, noclobber=False, ) else: small_img_basename = None img_metadata["small_img"] = small_img_basename tooltip_match = BROWSE_RE.search(img_metadata["img"]) if tooltip_match: img_metadata["tooltip"] = tooltip_match.group(1) else: img_metadata["tooltip"] = "" imgs_metadata.append(img_metadata) # sort browse images browse_sort_order = r.getBrowseSortOrder() if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0: bso_regexes = [re.compile(i) for i in browse_sort_order] sorter = {} unrecognized = [] for img in imgs_metadata: matched = None for i, bso_re in enumerate(bso_regexes): if bso_re.search(img["img"]): matched = img sorter[i] = matched break if matched is None: unrecognized.append(img) imgs_metadata = [sorter[i] for i in sorted(sorter)] imgs_metadata.extend(unrecognized) else: logger.info("Browse publish is not configured.") browse_urls = [] imgs_metadata = [] # set update json update_json = { "id": objectid, "objectid": objectid, "metadata": metadata, "dataset": ipath.split("/")[1], "ipath": ipath, "system_version": version, "dataset_level": level, "dataset_type": dtype, "urls": pub_urls, "browse_urls": browse_urls, "images": imgs_metadata, "prov": context.get("_prov", {}), } update_json.update(dataset) # logger.info("update_json: %s" % pformat(update_json)) # custom index specified? index = r.getIndex() if index is not None: update_json["index"] = index # update GRQ if dry_run: update_json["grq_index_result"] = {"index": index} logger.info("Would've indexed doc at %s: %s" % (grq_update_url, json.dumps(update_json, indent=2, sort_keys=True))) else: res = index_dataset(grq_update_url, update_json) logger.info("res: %s" % res) update_json["grq_index_result"] = res # finish if dry run if dry_run: try: shutil.rmtree(publ_ctx_dir) except: pass return (prod_metrics, update_json) # create PROV-ES JSON file for publish processStep prod_prov_es_file = os.path.join( local_prod_path, "%s.prov_es.json" % os.path.basename(local_prod_path)) pub_prov_es_bn = "publish.prov_es.json" if os.path.exists(prod_prov_es_file): pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn) prov_es_info = {} with open(prod_prov_es_file) as f: try: prov_es_info = json.load(f) except Exception as e: tb = traceback.format_exc() raise RuntimeError( "Failed to load PROV-ES from {}: {}\n{}".format( prod_prov_es_file, str(e), tb)) log_publish_prov_es( prov_es_info, pub_prov_es_file, local_prod_path, pub_urls, prod_metrics, objectid, ) # upload publish PROV-ES file osaka.main.put( pub_prov_es_file, os.path.join(pub_path_url, pub_prov_es_bn), params=osaka_params, noclobber=False, ) # cleanup publish context if publ_ctx_url is not None: try: osaka.main.rmall(publ_ctx_url, params=osaka_params) except: logger.warn( "Failed to clean up publish context at {} on successful publish." .format(publ_ctx_url)) try: shutil.rmtree(publ_ctx_dir) except: pass # queue data dataset queue_dataset(ipath, update_json, dataset_processed_queue) # return dataset metrics and dataset json return (prod_metrics, update_json)