Example #1
0
def ingest(objectid,
           dsets_file,
           grq_update_url,
           dataset_processed_queue,
           prod_path,
           job_path,
           dry_run=False,
           force=False):
    """Run dataset ingest."""
    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get default job path
    if job_path is None:
        job_path = os.getcwd()

    # detect job info
    job = {}
    job_json = os.path.join(job_path, '_job.json')
    if os.path.exists(job_json):
        with open(job_json) as f:
            try:
                job = json.load(f)
            except Exception as e:
                logger.warn("Failed to read job json:\n{}".format(str(e)))
Example #2
0
def offline_jobs(event):
    """Set job status to job-offline."""

    time_end = datetime.utcnow().isoformat() + "Z"
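    # find jobs still marked job-started on the worker that just went offline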
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"celery_hostname": event["hostname"]}},
                    {"term": {"status": "job-started"}},
                ]
            }
        }
    }
    logger.info("offline jobs query: %s" % json.dumps(query))
    uuids = []

    try:
        job_status_jsons = mozart_es.query(index="job_status-current", body=query)
        logger.info("Got {} jobs for {}.".format(len(job_status_jsons), event["hostname"]))

        for job_status in job_status_jsons:
            job_status_json = job_status['_source']
            uuid = job_status_json["uuid"]

            # offline the job only if it hasn't been picked up by another worker
            cur_job_status = get_val_via_socket(JOB_STATUS_KEY_TMPL % uuid)
            cur_job_worker = get_val_via_socket(TASK_WORKER_KEY_TMPL % uuid)
            logger.info("cur_job_status: {}".format(cur_job_status))
            logger.info("cur_job_worker: {}".format(cur_job_worker))

            if cur_job_status == "job-started" and cur_job_worker == event["hostname"]:
                job_status_json["status"] = "job-offline"
                job_status_json["error"] = "Received worker-offline event during job execution."
                job_status_json["short_error"] = "worker-offline"
                job_status_json.setdefault("job", {}).setdefault("job_info", {})["time_end"] = time_end
                log_job_status(job_status_json)
                logger.info("Offlined job with UUID %s" % uuid)
                uuids.append(uuid)
            else:
                logger.info("Not offlining job with UUID %s since real-time job status doesn't match" % uuid)
    except Exception as e:
        logger.warn("Got exception trying to update task events for offline worker %s: %s\n%s" % (
            event["hostname"], str(e), traceback.format_exc()))
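
# A minimal usage sketch (hypothetical; the event dict mirrors the only field this
# function reads from a Celery worker-offline event, and the hostname is a placeholder):
offline_jobs({"hostname": "celery@worker-node-1"})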
Example #3
0
def check_dataset(id, es_index="grq"):
    """Query for dataset with specified input ID."""

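    # exact-match the document _id; no stored fields are requested since only the hit count is needed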
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "term": {
                            "_id": id
                        }
                    },
                ]
            }
        },
        "fields": [],
    }
    es_url = app.conf['GRQ_ES_URL']
    if es_url.endswith('/'):
        search_url = '%s%s/_search' % (es_url, es_index)
    else:
        search_url = '%s/%s/_search' % (es_url, es_index)
    r = requests.post(search_url, data=json.dumps(query))
    if r.status_code == 200:
        result = r.json()
        total = result['hits']['total']
    else:
        logger.warn("Failed to query %s:\n%s" % (es_url, r.text))
        logger.warn("query: %s" % json.dumps(query, indent=2))
        logger.warn("returned: %s" % r.text)
        if r.status_code == 404:
            total = 0
        else:
            r.raise_for_status()
    return total
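
# A minimal usage sketch (hypothetical caller; the dataset ID is a placeholder and
# app.conf['GRQ_ES_URL'] is assumed to point at a reachable GRQ Elasticsearch):
if check_dataset("my-dataset-id") > 0:
    logger.info("Dataset my-dataset-id already exists in GRQ; skipping re-ingest.")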
Example #4
0
        if dry_run:
            logger.info("Would've published %s to %s" %
                        (local_prod_path, pub_path_url))
        else:
            publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name)
            orig_publ_ctx_file = publ_ctx_file + '.orig'
            try:
                publish_dataset(local_prod_path,
                                pub_path_url,
                                params=osaka_params,
                                force=force,
                                publ_ctx_file=publ_ctx_file,
                                publ_ctx_url=publ_ctx_url)
            except NoClobberPublishContextException as e:
                logger.warn(
                    "A publish context file was found at {}. Retrieving.".
                    format(publ_ctx_url))
                osaka.main.get(publ_ctx_url,
                               orig_publ_ctx_file,
                               params=osaka_params)
                with open(orig_publ_ctx_file) as f:
                    orig_publ_ctx = json.load(f)
                logger.warn("original publish context: {}".format(
                    json.dumps(orig_publ_ctx, indent=2, sort_keys=True)))
                orig_payload_id = orig_publ_ctx.get('payload_id', None)
                orig_payload_hash = orig_publ_ctx.get('payload_hash', None)
                orig_task_id = orig_publ_ctx.get('task_id', None)
                logger.warn("orig payload_id: {}".format(orig_payload_id))
                logger.warn("orig payload_hash: {}".format(orig_payload_hash))
                logger.warn("orig task_id: {}".format(orig_task_id))
Example #5
0
def ensure_image_loaded(image_name, image_url, cache_dir):
    """Pull docker image into local repo."""

    # check if image is in local docker repo
    try:
        registry = app.conf.get("CONTAINER_REGISTRY", None)
        # Custom edit to load image from registry
        try:
            if registry is not None:
                logger.info(
                    "Trying to load docker image {} from registry '{}'".format(
                        image_name, registry))
                registry_url = os.path.join(registry, image_name)
                logger.info("docker pull {}".format(registry_url))
                check_output(['docker', 'pull', registry_url])
                logger.info("docker tag {} {}".format(registry_url, image_name))
                check_output(['docker', 'tag', registry_url, image_name])
        except Exception as e:
            logger.warn(
                "Unable to load docker image from registry '{}': {}".format(
                    registry, e))

        image_info = check_output(['docker', 'inspect', image_name])
        logger.info("Docker image %s cached in repo" % image_name)
    except:
        logger.info("Failed to inspect docker image %s" % image_name)

        # pull image from url
        if image_url is not None:
            image_file = os.path.join(cache_dir, os.path.basename(image_url))
            if not os.path.exists(image_file):
                logger.info("Downloading image %s (%s) from %s" %
                            (image_file, image_name, image_url))
                try:
                    osaka.main.get(image_url, image_file)
                except Exception as e:
                    raise RuntimeError("Failed to download image {}:\n{}".format(image_url, str(e)))
                logger.info("Downloaded image %s (%s) from %s" %
                            (image_file, image_name, image_url))
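            # serialize image loading across workers: writing the lock file fails with
            # EEXIST if another process already created it (handled in the OSError branch below)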
            load_lock = "{}.load.lock".format(image_file)
            try:
                with atomic_write(load_lock) as f:
                    f.write("%sZ\n" % datetime.utcnow().isoformat())
                logger.info("Loading image %s (%s)" % (image_file, image_name))
                p = Popen(['docker', 'load', '-i', image_file],
                          stderr=PIPE, stdout=PIPE)
                stdout, stderr = p.communicate()
                if p.returncode != 0:
                    raise RuntimeError("Failed to load image {} ({}): {}".format(image_file, image_name, stderr.decode()))
                logger.info("Loaded image %s (%s)" % (image_file, image_name))
                try:
                    os.unlink(image_file)
                except:
                    pass
                try:
                    os.unlink(load_lock)
                except:
                    pass
            except OSError as e:
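                # errno 17 (EEXIST): another worker already holds the load lock,
                # so wait for that load to finish instead of loading the image again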
                if e.errno == 17:
                    logger.info("Waiting for image %s (%s) to load" %
                                (image_file, image_name))
                    inspect_image(image_name)
                else:
                    raise
        else:
            # pull image from docker hub
            logger.info("Pulling image %s from docker hub" % image_name)
            check_output(['docker', 'pull', image_name])
            logger.info("Pulled image %s from docker hub" % image_name)
        image_info = check_output(['docker', 'inspect', image_name])
    logger.info("image info for %s: %s" % (image_name, image_info.decode()))
    return json.loads(image_info)[0]
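
# A minimal usage sketch (hypothetical values; the image name, tarball URL, and cache
# directory are placeholders, not from the original code):
image_info = ensure_image_loaded(
    "my_container:latest",
    "s3://my-bucket/containers/my_container-latest.tar.gz",
    "/data/work/cache",
)
logger.info("Loaded image ID: %s" % image_info["Id"])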
Example #6
0
def ingest(
    objectid,
    dsets_file,
    grq_update_url,
    dataset_processed_queue,
    prod_path,
    job_path,
    dry_run=False,
    force=False,
):
    """Run dataset ingest."""
    logger.info("#" * 80)
    logger.info("datasets: %s" % dsets_file)
    logger.info("grq_update_url: %s" % grq_update_url)
    logger.info("dataset_processed_queue: %s" % dataset_processed_queue)
    logger.info("prod_path: %s" % prod_path)
    logger.info("job_path: %s" % job_path)
    logger.info("dry_run: %s" % dry_run)
    logger.info("force: %s" % force)

    # get default job path
    if job_path is None:
        job_path = os.getcwd()

    # detect job info
    job = {}
    job_json = os.path.join(job_path, "_job.json")
    if os.path.exists(job_json):
        with open(job_json) as f:
            try:
                job = json.load(f)
            except Exception as e:
                logger.warn("Failed to read job json:\n{}".format(str(e)))
    task_id = job.get("task_id", None)
    payload_id = (job.get("job_info", {}).get("job_payload",
                                              {}).get("payload_task_id", None))
    payload_hash = job.get("job_info", {}).get("payload_hash", None)
    logger.info("task_id: %s" % task_id)
    logger.info("payload_id: %s" % payload_id)
    logger.info("payload_hash: %s" % payload_hash)

    # get dataset
    if os.path.isdir(prod_path):
        local_prod_path = prod_path
    else:
        local_prod_path = get_remote_dav(prod_path)
    if not os.path.isdir(local_prod_path):
        raise RuntimeError("Failed to find local dataset directory: %s" %
                           local_prod_path)

    # write publish context
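    # the context records which job (payload/task/hash IDs) produced this dataset so a
    # later publish attempt that finds it left behind can decide whether to clobber an orphan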
    publ_ctx_name = "_publish.context.json"
    publ_ctx_dir = mkdtemp(prefix=".pub_context", dir=job_path)
    publ_ctx_file = os.path.join(publ_ctx_dir, publ_ctx_name)
    with open(publ_ctx_file, "w") as f:
        json.dump(
            {
                "payload_id": payload_id,
                "payload_hash": payload_hash,
                "task_id": task_id,
            },
            f,
            indent=2,
            sort_keys=True,
        )
    publ_ctx_url = None

    # dataset name
    pname = os.path.basename(local_prod_path)

    # dataset file
    dataset_file = os.path.join(local_prod_path, "%s.dataset.json" % pname)

    # get dataset json
    with open(dataset_file) as f:
        dataset = json.load(f)
    logger.info("Loaded dataset JSON from file: %s" % dataset_file)

    # check minimum requirements for dataset JSON
    logger.info("Verifying dataset JSON...")
    verify_dataset(dataset)
    logger.info("Dataset JSON verification succeeded.")

    # get version
    version = dataset["version"]

    # recognize
    r = Recognizer(dsets_file, local_prod_path, objectid, version)

    # get extractor
    extractor = r.getMetadataExtractor()
    if extractor is not None:
        match = SCRIPT_RE.search(extractor)
        if match:
            extractor = match.group(1)
    logger.info("Configured metadata extractor: %s" % extractor)

    # metadata file
    metadata_file = os.path.join(local_prod_path, "%s.met.json" % pname)

    # metadata seed file
    seed_file = os.path.join(local_prod_path, "met.json")

    # metadata file already here
    if os.path.exists(metadata_file):
        with open(metadata_file) as f:
            metadata = json.load(f)
        logger.info("Loaded metadata from existing file: %s" % metadata_file)
    else:
        if extractor is None:
            logger.info(
                "No metadata extraction configured. Setting empty metadata.")
            metadata = {}
        else:
            logger.info("Running metadata extractor %s on %s" %
                        (extractor, local_prod_path))
            m = check_output([extractor, local_prod_path])
            logger.info("Output: %s" % m.decode())

            # generate json to update metadata and urls
            metadata = json.loads(m)

            # set data_product_name
            metadata["data_product_name"] = objectid

            # merge with seed metadata
            if os.path.exists(seed_file):
                with open(seed_file) as f:
                    seed = json.load(f)
                metadata.update(seed)
                logger.info("Loaded seed metadata from file: %s" % seed_file)

            # write it out to file
            with open(metadata_file, "w") as f:
                json.dump(metadata, f, indent=2)
            logger.info("Wrote metadata to %s" % metadata_file)

            # delete seed file
            if os.path.exists(seed_file):
                os.unlink(seed_file)
                logger.info("Deleted seed file %s." % seed_file)

    # read context
    context_file = os.path.join(local_prod_path, "%s.context.json" % pname)
    if os.path.exists(context_file):
        with open(context_file) as f:
            context = json.load(f)
        logger.info("Loaded context from existing file: %s" % context_file)
    else:
        context = {}

    # set metadata and dataset groups in recognizer
    r.setDataset(dataset)
    r.setMetadata(metadata)

    # get ipath
    ipath = r.getIpath()

    # get level
    level = r.getLevel()

    # get type
    dtype = r.getType()

    # set product metrics
    prod_metrics = {"ipath": ipath, "path": local_prod_path}

    # publish dataset
    if r.publishConfigured():
        logger.info("Dataset publish is configured.")

        # get publish path
        pub_path_url = r.getPublishPath()

        # get publish urls
        pub_urls = [i for i in r.getPublishUrls()]

        # get S3 profile name and api keys for dataset publishing
        s3_secret_key, s3_access_key = r.getS3Keys()
        s3_profile = r.getS3Profile()

        # set osaka params
        osaka_params = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile is not None:
            osaka_params["profile_name"] = s3_profile
        else:
            if s3_secret_key is not None and s3_access_key is not None:
                osaka_params["aws_access_key_id"] = s3_access_key
                osaka_params["aws_secret_access_key"] = s3_secret_key

        # get pub host and path
        logger.info("Configured pub host & path: %s" % (pub_path_url))

        # check scheme
        if not osaka.main.supported(pub_path_url):
            raise RuntimeError("Scheme %s is currently not supported." %
                               urlparse(pub_path_url).scheme)

        # upload dataset to repo; track disk usage and start/end times of transfer
        prod_dir_usage = get_disk_usage(local_prod_path)
        tx_t1 = datetime.utcnow()
        if dry_run:
            logger.info("Would've published %s to %s" %
                        (local_prod_path, pub_path_url))
        else:
            publ_ctx_url = os.path.join(pub_path_url, publ_ctx_name)
            orig_publ_ctx_file = publ_ctx_file + ".orig"
            try:
                publish_dataset(
                    local_prod_path,
                    pub_path_url,
                    params=osaka_params,
                    force=force,
                    publ_ctx_file=publ_ctx_file,
                    publ_ctx_url=publ_ctx_url,
                )
            except NoClobberPublishContextException as e:
                logger.warn(
                    "A publish context file was found at {}. Retrieving.".
                    format(publ_ctx_url))
                osaka.main.get(publ_ctx_url,
                               orig_publ_ctx_file,
                               params=osaka_params)
                with open(orig_publ_ctx_file) as f:
                    orig_publ_ctx = json.load(f)
                logger.warn("original publish context: {}".format(
                    json.dumps(orig_publ_ctx, indent=2, sort_keys=True)))
                orig_payload_id = orig_publ_ctx.get("payload_id", None)
                orig_payload_hash = orig_publ_ctx.get("payload_hash", None)
                orig_task_id = orig_publ_ctx.get("task_id", None)
                logger.warn("orig payload_id: {}".format(orig_payload_id))
                logger.warn("orig payload_hash: {}".format(orig_payload_hash))
                logger.warn("orig task_id: {}".format(orig_task_id))

                if orig_payload_id is None:
                    raise

                # overwrite if this job is a retry of the previous job
                if payload_id is not None and payload_id == orig_payload_id:
                    msg = (
                        "This job is a retry of a previous job that resulted "
                        + "in an orphaned dataset. Forcing publish.")
                    logger.warn(msg)
                    log_custom_event(
                        "orphaned_dataset-retry_previous_failed",
                        "clobber",
                        {
                            "orphan_info": {
                                "payload_id": payload_id,
                                "payload_hash": payload_hash,
                                "task_id": task_id,
                                "orig_payload_id": orig_payload_id,
                                "orig_payload_hash": orig_payload_hash,
                                "orig_task_id": orig_task_id,
                                "dataset_id": objectid,
                                "dataset_url": pub_path_url,
                                "msg": msg,
                            }
                        },
                    )
                else:
                    job_status = get_job_status(orig_payload_id)
                    logger.warn("orig job status: {}".format(job_status))

                    # overwrite if previous job failed
                    if job_status == "job-failed":
                        msg = (
                            "Detected previous job failure that resulted in an "
                            + "orphaned dataset. Forcing publish.")
                        logger.warn(msg)
                        log_custom_event(
                            "orphaned_dataset-job_failed",
                            "clobber",
                            {
                                "orphan_info": {
                                    "payload_id": payload_id,
                                    "payload_hash": payload_hash,
                                    "task_id": task_id,
                                    "orig_payload_id": orig_payload_id,
                                    "orig_payload_hash": orig_payload_hash,
                                    "orig_task_id": orig_task_id,
                                    "orig_status": job_status,
                                    "dataset_id": objectid,
                                    "dataset_url": pub_path_url,
                                    "msg": msg,
                                }
                            },
                        )
                    else:
                        # overwrite if dataset doesn't exist in grq
                        if not dataset_exists(objectid):
                            msg = "Detected orphaned dataset without ES doc. Forcing publish."
                            logger.warn(msg)
                            log_custom_event(
                                "orphaned_dataset-no_es_doc",
                                "clobber",
                                {
                                    "orphan_info": {
                                        "payload_id": payload_id,
                                        "payload_hash": payload_hash,
                                        "task_id": task_id,
                                        "dataset_id": objectid,
                                        "dataset_url": pub_path_url,
                                        "msg": msg,
                                    }
                                },
                            )
                        else:
                            raise
                publish_dataset(
                    local_prod_path,
                    pub_path_url,
                    params=osaka_params,
                    force=True,
                    publ_ctx_file=publ_ctx_file,
                    publ_ctx_url=publ_ctx_url,
                )
            except osaka.utils.NoClobberException as e:
                if dataset_exists(objectid):
                    try:
                        osaka.main.rmall(publ_ctx_url, params=osaka_params)
                    except:
                        logger.warn(
                            "Failed to clean up publish context {} after attempting to clobber valid dataset."
                            .format(publ_ctx_url))
                    raise
                else:
                    msg = "Detected orphaned dataset without ES doc. Forcing publish."
                    logger.warn(msg)
                    log_custom_event(
                        "orphaned_dataset-no_es_doc",
                        "clobber",
                        {
                            "orphan_info": {
                                "payload_id": payload_id,
                                "payload_hash": payload_hash,
                                "task_id": task_id,
                                "dataset_id": objectid,
                                "dataset_url": pub_path_url,
                                "msg": msg,
                            }
                        },
                    )
                    publish_dataset(
                        local_prod_path,
                        pub_path_url,
                        params=osaka_params,
                        force=True,
                        publ_ctx_file=publ_ctx_file,
                        publ_ctx_url=publ_ctx_url,
                    )
        tx_t2 = datetime.utcnow()
        tx_dur = (tx_t2 - tx_t1).total_seconds()

        # save dataset metrics on size and transfer
        prod_metrics.update({
            "url": urlparse(pub_path_url).path,
            "disk_usage": prod_dir_usage,
            "time_start": tx_t1.isoformat() + "Z",
            "time_end": tx_t2.isoformat() + "Z",
            "duration": tx_dur,
            "transfer_rate": prod_dir_usage / tx_dur,
        })
    else:
        logger.info("Dataset publish is not configured.")
        pub_urls = []

    # publish browse
    if r.browseConfigured():
        logger.info("Browse publish is configured.")

        # get browse path and urls
        browse_path = r.getBrowsePath()
        browse_urls = r.getBrowseUrls()

        # get S3 profile name and api keys for browse image publishing
        s3_secret_key_browse, s3_access_key_browse = r.getS3Keys("browse")
        s3_profile_browse = r.getS3Profile("browse")

        # set osaka params for browse
        osaka_params_browse = {}

        # S3 profile takes precedence over explicit api keys
        if s3_profile_browse is not None:
            osaka_params_browse["profile_name"] = s3_profile_browse
        else:
            if s3_secret_key_browse is not None and s3_access_key_browse is not None:
                osaka_params_browse["aws_access_key_id"] = s3_access_key_browse
                osaka_params_browse[
                    "aws_secret_access_key"] = s3_secret_key_browse

        # add metadata for all browse images and upload to browse location
        imgs_metadata = []
        imgs = glob("%s/*browse.png" % local_prod_path)
        for img in imgs:
            img_metadata = {"img": os.path.basename(img)}
            small_img = img.replace("browse.png", "browse_small.png")
            if os.path.exists(small_img):
                small_img_basename = os.path.basename(small_img)
                if browse_path is not None:
                    this_browse_path = os.path.join(browse_path,
                                                    small_img_basename)
                    if dry_run:
                        logger.info("Would've uploaded %s to %s" %
                                    (small_img, browse_path))
                    else:
                        logger.info("Uploading %s to %s" %
                                    (small_img, browse_path))
                        osaka.main.put(
                            small_img,
                            this_browse_path,
                            params=osaka_params_browse,
                            noclobber=False,
                        )
            else:
                small_img_basename = None
            img_metadata["small_img"] = small_img_basename
            tooltip_match = BROWSE_RE.search(img_metadata["img"])
            if tooltip_match:
                img_metadata["tooltip"] = tooltip_match.group(1)
            else:
                img_metadata["tooltip"] = ""
            imgs_metadata.append(img_metadata)

        # sort browse images
        browse_sort_order = r.getBrowseSortOrder()
        if isinstance(browse_sort_order, list) and len(browse_sort_order) > 0:
            bso_regexes = [re.compile(i) for i in browse_sort_order]
            sorter = {}
            unrecognized = []
            for img in imgs_metadata:
                matched = None
                for i, bso_re in enumerate(bso_regexes):
                    if bso_re.search(img["img"]):
                        matched = img
                        sorter[i] = matched
                        break
                if matched is None:
                    unrecognized.append(img)
            imgs_metadata = [sorter[i] for i in sorted(sorter)]
            imgs_metadata.extend(unrecognized)
    else:
        logger.info("Browse publish is not configured.")
        browse_urls = []
        imgs_metadata = []

    # set update json
    update_json = {
        "id": objectid,
        "objectid": objectid,
        "metadata": metadata,
        "dataset": ipath.split("/")[1],
        "ipath": ipath,
        "system_version": version,
        "dataset_level": level,
        "dataset_type": dtype,
        "urls": pub_urls,
        "browse_urls": browse_urls,
        "images": imgs_metadata,
        "prov": context.get("_prov", {}),
    }
    update_json.update(dataset)
    # logger.info("update_json: %s" % pformat(update_json))

    # custom index specified?
    index = r.getIndex()
    if index is not None:
        update_json["index"] = index

    # update GRQ
    if dry_run:
        update_json["grq_index_result"] = {"index": index}
        logger.info("Would've indexed doc at %s: %s" %
                    (grq_update_url,
                     json.dumps(update_json, indent=2, sort_keys=True)))
    else:
        res = index_dataset(grq_update_url, update_json)
        logger.info("res: %s" % res)
        update_json["grq_index_result"] = res

    # finish if dry run
    if dry_run:
        try:
            shutil.rmtree(publ_ctx_dir)
        except:
            pass
        return (prod_metrics, update_json)

    # create PROV-ES JSON file for publish processStep
    prod_prov_es_file = os.path.join(
        local_prod_path, "%s.prov_es.json" % os.path.basename(local_prod_path))
    pub_prov_es_bn = "publish.prov_es.json"
    if os.path.exists(prod_prov_es_file):
        pub_prov_es_file = os.path.join(local_prod_path, pub_prov_es_bn)
        prov_es_info = {}
        with open(prod_prov_es_file) as f:
            try:
                prov_es_info = json.load(f)
            except Exception as e:
                tb = traceback.format_exc()
                raise RuntimeError(
                    "Failed to load PROV-ES from {}: {}\n{}".format(
                        prod_prov_es_file, str(e), tb))
        log_publish_prov_es(
            prov_es_info,
            pub_prov_es_file,
            local_prod_path,
            pub_urls,
            prod_metrics,
            objectid,
        )
        # upload publish PROV-ES file
        osaka.main.put(
            pub_prov_es_file,
            os.path.join(pub_path_url, pub_prov_es_bn),
            params=osaka_params,
            noclobber=False,
        )

    # cleanup publish context
    if publ_ctx_url is not None:
        try:
            osaka.main.rmall(publ_ctx_url, params=osaka_params)
        except:
            logger.warn(
                "Failed to clean up publish context at {} on successful publish."
                .format(publ_ctx_url))
    try:
        shutil.rmtree(publ_ctx_dir)
    except:
        pass

    # queue dataset to the dataset-processed queue
    queue_dataset(ipath, update_json, dataset_processed_queue)

    # return dataset metrics and dataset json
    return (prod_metrics, update_json)
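
# A minimal usage sketch (hypothetical; every path, URL, and queue name below is a
# placeholder assumption, and dry_run=True skips publishing and indexing):
prod_metrics, update_json = ingest(
    objectid="my-dataset-id",
    dsets_file="/home/ops/datasets.json",
    grq_update_url="http://grq-host/placeholder-update-endpoint",
    dataset_processed_queue="dataset_processed",
    prod_path="/data/work/my-dataset-id",
    job_path=None,
    dry_run=True,
)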