Example No. 1
    def setup_batch_jobs():
        if not ConfigParams().is_ci_context:
            with JobRegistry() as job_registry:
                job_registry.ensure_paths()

            job_tracker = JobTracker(JobRegistry, principal="", keytab="")
            threading.Thread(target=job_tracker.loop_update_statuses,
                             daemon=True).start()
Example No. 2
    def cancel_job(self, job_id: str, user_id: str):
        with JobRegistry() as registry:
            application_id = registry.get_job(job_id, user_id)['application_id']
        # TODO: better logging of this kill.
        subprocess.run(
            ["yarn", "application", "-kill", application_id],
            timeout=20,
            check=True,
        )
Example No. 3
    def create_job(self, user_id: str, job_specification: dict,
                   api_version: str) -> BatchJobMetadata:
        job_id = str(uuid.uuid4())
        with JobRegistry() as registry:
            job_info = registry.register(job_id=job_id,
                                         user_id=user_id,
                                         api_version=api_version,
                                         specification=job_specification)
        return BatchJobMetadata(id=job_id,
                                process=job_specification,
                                status=job_info["status"],
                                created=parse_rfc3339(job_info["created"]))
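Example No. 4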
def when_ready(server):
    print(server)
    from pyspark import SparkContext
    sc = SparkContext.getOrCreate()

    principal = sc.getConf().get("spark.yarn.principal")
    keytab = sc.getConf().get("spark.yarn.keytab")

    logging.getLogger('gunicorn.error').info('Gunicorn info logging enabled!')
    logging.getLogger('flask').info('Flask info logging enabled!')

    with JobRegistry() as job_registry:
        job_registry.ensure_paths()

    job_tracker = JobTracker(JobRegistry, principal, keytab)
    threading.Thread(target=job_tracker.update_statuses, daemon=True).start()
Example No. 5
    def update_statuses(self) -> None:
        with self._job_registry() as registry:
            registry.ensure_paths()

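            # Sync the status of every job the registry still lists as running.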
            jobs_to_track = registry.get_running_jobs()

            for job_info in jobs_to_track:
                try:
                    job_id, user_id = job_info['job_id'], job_info['user_id']
                    application_id = job_info['application_id']
                    current_status = job_info['status']

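                    # Jobs without an application id (presumably not yet submitted to the cluster) are skipped.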
                    if application_id:
                        try:
                            if ConfigParams().is_kube_deploy:
                                from openeogeotrellis.utils import s3_client, download_s3_dir
                                state, start_time, finish_time = JobTracker._kube_status(
                                    job_id, user_id)

                                new_status = JobTracker._kube_status_parser(
                                    state)

                                registry.patch(job_id,
                                               user_id,
                                               status=new_status,
                                               started=start_time,
                                               finished=finish_time)

                                if current_status != new_status:
                                    _log.info(
                                        "changed job %s status from %s to %s" %
                                        (job_id, current_status, new_status),
                                        extra={'job_id': job_id})

                                if state == "COMPLETED":
                                    # TODO: do we support SHub batch processes in this environment? The AWS
                                    #  credentials conflict.
                                    download_s3_dir(
                                        "OpenEO-data",
                                        "batch_jobs/{j}".format(j=job_id))

                                    result_metadata = self._batch_jobs.get_results_metadata(
                                        job_id, user_id)
                                    registry.patch(job_id, user_id,
                                                   **result_metadata)

                                    registry.mark_done(job_id, user_id)
                                    _log.info("marked %s as done" % job_id,
                                              extra={'job_id': job_id})
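                            # Outside Kubernetes, job state is tracked via YARN instead.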
                            else:
                                state, final_state, start_time, finish_time, aggregate_resource_allocation =\
                                    JobTracker._yarn_status(application_id)

                                memory_time_megabyte_seconds, cpu_time_seconds =\
                                    JobTracker._parse_resource_allocation(aggregate_resource_allocation)

                                new_status = JobTracker._to_openeo_status(
                                    state, final_state)

                                registry.patch(
                                    job_id,
                                    user_id,
                                    status=new_status,
                                    started=JobTracker._to_serializable_datetime(start_time),
                                    finished=JobTracker._to_serializable_datetime(finish_time),
                                    memory_time_megabyte_seconds=memory_time_megabyte_seconds,
                                    cpu_time_seconds=cpu_time_seconds)

                                if current_status != new_status:
                                    _log.info(
                                        "changed job %s status from %s to %s" %
                                        (job_id, current_status, new_status),
                                        extra={'job_id': job_id})

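                                # YARN reports final state UNDEFINED while the application is still running.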
                                if final_state != "UNDEFINED":
                                    result_metadata = self._batch_jobs.get_results_metadata(
                                        job_id, user_id)
                                    # TODO: skip patching the job znode and read from this file directly?
                                    registry.patch(job_id, user_id,
                                                   **result_metadata)

                                    if new_status == 'finished':
                                        registry.remove_dependencies(
                                            job_id, user_id)

                                        dependency_sources = JobRegistry.get_dependency_sources(
                                            job_info)

                                        if dependency_sources:
                                            async_task.schedule_delete_batch_process_dependency_sources(
                                                job_id, dependency_sources)

                                    registry.mark_done(job_id, user_id)

                                    _log.info("marked %s as done" % job_id,
                                              extra={
                                                  'job_id': job_id,
                                                  'area': result_metadata.get('area'),
                                                  'unique_process_ids': result_metadata.get('unique_process_ids'),
                                                  'cpu_time_seconds': cpu_time_seconds
                                              })
                        except JobTracker._UnknownApplicationIdException:
                            registry.mark_done(job_id, user_id)
                except Exception:
                    _log.warning(
                        "resuming with remaining jobs after failing to handle batch job {j}:\n{e}"
                        .format(j=job_id, e=traceback.format_exc()),
                        extra={'job_id': job_id})
                    registry.set_status(job_id, user_id, 'error')
                    registry.mark_done(job_id, user_id)
Example No. 6
def main():
    import argparse

    logging.basicConfig(level=logging.INFO)
    openeogeotrellis.backend.logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(stream=sys.stdout)
    handler.formatter = JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s", datefmt="%Y-%m-%dT%H:%M:%S%z")

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)

    _log.info("argv: {a!r}".format(a=sys.argv))
    _log.info("ConfigParams(): {c}".format(c=ConfigParams()))

    # FIXME: there's no Java output because Py4J redirects the JVM's stdout/stderr to /dev/null unless JavaGateway's
    #  redirect_stdout/redirect_stderr are set (EP-4018)

    try:
        parser = argparse.ArgumentParser(usage="OpenEO AsyncTask --task <task>",
                                         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser.add_argument("--py4j-jarpath", default="venv/share/py4j/py4j0.10.7.jar", help='Path to the Py4J jar')
        parser.add_argument("--py4j-classpath", default="geotrellis-extensions-2.2.0-SNAPSHOT.jar",
                            help='Classpath used to launch the Java Gateway')
        parser.add_argument("--principal", default="*****@*****.**", help="Principal to be used to login to KDC")
        parser.add_argument("--keytab", default="openeo-deploy/mep/openeo.keytab",
                            help="The full path to the file that contains the keytab for the principal")
        parser.add_argument("--task", required=True, dest="task_json", help="The task description in JSON")

        args = parser.parse_args()

        task = json.loads(args.task_json)
        task_id = task['task_id']
        if task_id not in [TASK_DELETE_BATCH_PROCESS_RESULTS, TASK_POLL_SENTINELHUB_BATCH_PROCESSES,
                           TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            raise ValueError(f'unsupported task_id "{task_id}"')

        arguments: dict = task.get('arguments', {})

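        # Helper that launches a Py4J Java gateway on demand and wraps it in a GpsBatchJobs instance.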
        def batch_jobs() -> GpsBatchJobs:
            java_opts = [
                "-client",
                "-Dsoftware.amazon.awssdk.http.service.impl=software.amazon.awssdk.http.urlconnection.UrlConnectionSdkHttpService"
            ]

            java_gateway = JavaGateway.launch_gateway(jarpath=args.py4j_jarpath,
                                                      classpath=args.py4j_classpath,
                                                      javaopts=java_opts,
                                                      die_on_exit=True)

            return GpsBatchJobs(get_layer_catalog(opensearch_enrich=True), java_gateway.jvm, args.principal,
                                args.keytab)

        if task_id in [TASK_DELETE_BATCH_PROCESS_RESULTS, TASK_DELETE_BATCH_PROCESS_DEPENDENCY_SOURCES]:
            batch_job_id = arguments['batch_job_id']
            dependency_sources = (arguments.get('dependency_sources') or [f"s3://{sentinel_hub.OG_BATCH_RESULTS_BUCKET}/{subfolder}"
                                                                          for subfolder in arguments['subfolders']])

            _log.info(f"removing dependency sources {dependency_sources} for batch job {batch_job_id}...",
                      extra={'job_id': batch_job_id})
            batch_jobs().delete_batch_process_dependency_sources(job_id=batch_job_id,
                                                                 dependency_sources=dependency_sources,
                                                                 propagate_errors=True)
        elif task_id == TASK_POLL_SENTINELHUB_BATCH_PROCESSES:
            batch_job_id = arguments['batch_job_id']
            user_id = arguments['user_id']

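            # Poll until the job is no longer awaiting its Sentinel Hub batch processes.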
            while True:
                time.sleep(SENTINEL_HUB_BATCH_PROCESSES_POLL_INTERVAL_S)

                with JobRegistry() as registry:
                    job_info = registry.get_job(batch_job_id, user_id)

                if job_info.get('dependency_status') not in ['awaiting', "awaiting_retry"]:
                    break
                else:
                    try:
                        batch_jobs().poll_sentinelhub_batch_processes(job_info)
                    except Exception:
                        # TODO: retry in Nifi? How to mark this job as 'error' then?
                        _log.error("failed to handle polling batch processes for batch job {j}:\n{e}"
                                   .format(j=batch_job_id, e=traceback.format_exc()),
                                   extra={'job_id': batch_job_id})

                        with JobRegistry() as registry:
                            registry.set_status(batch_job_id, user_id, 'error')
                            registry.mark_done(batch_job_id, user_id)

                        raise

        else:
            raise AssertionError(f'unexpected task_id "{task_id}"')
    except Exception as e:
        _log.error(e, exc_info=True)
        raise e
Example No. 7
    def start_job(self, job_id: str, user_id: str):
        from pyspark import SparkContext

        with JobRegistry() as registry:
            job_info = registry.get_job(job_id, user_id)
            api_version = job_info.get('api_version')

            current_status = job_info['status']
            if current_status in ['queued', 'running']:
                return
            elif current_status != 'created':
                # TODO: is this about restarting a job?
                registry.mark_ongoing(job_id, user_id)
                registry.set_application_id(job_id, user_id, None)
                registry.set_status(job_id, user_id, 'created')

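            # Job options in the stored specification may override the default driver/executor memory.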
            spec = json.loads(job_info.get('specification'))
            extra_options = spec.get('job_options', {})

            driver_memory = extra_options.get("driver-memory", "22G")
            executor_memory = extra_options.get("executor-memory", "5G")

            kerberos()

            output_dir = self._get_job_output_dir(job_id)
            input_file = output_dir / "in"
            # TODO: how support multiple output files?
            output_file = output_dir / "out"
            log_file = output_dir / "log"

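            # Write the job specification to the input file that is passed to the submit script.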
            with input_file.open('w') as f:
                f.write(job_info['specification'])

            conf = SparkContext.getOrCreate().getConf()
            principal = conf.get("spark.yarn.principal")
            key_tab = conf.get("spark.yarn.keytab")

            script_location = pkg_resources.resource_filename(
                'openeogeotrellis.deploy', 'submit_batch_job.sh')

            args = [
                script_location,
                "OpenEO batch job {j} user {u}".format(j=job_id, u=user_id),
                str(input_file),
                str(output_file),
                str(log_file)
            ]

            if principal is not None and key_tab is not None:
                args.append(principal)
                args.append(key_tab)
            else:
                args.append("no_principal")
                args.append("no_keytab")
            if api_version:
                args.append(api_version)
            else:
                args.append("0.4.0")

            args.append(driver_memory)
            args.append(executor_memory)

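            # Run the submit script; its combined stdout/stderr should contain the YARN application id.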
            try:
                output_string = subprocess.check_output(
                    args, stderr=subprocess.STDOUT, universal_newlines=True)
            except CalledProcessError as e:
                logger.exception(e)
                logger.error(e.stdout)
                logger.error(e.stderr)
                raise e

            try:
                # note: a job_id is returned as soon as an application ID is found in stderr, not when the job is finished
                logger.info(output_string)
                application_id = self._extract_application_id(output_string)
                print("mapped job_id %s to application ID %s" %
                      (job_id, application_id))

                registry.set_application_id(job_id, user_id, application_id)
            except _BatchJobError as e:
                traceback.print_exc(file=sys.stderr)
                # TODO: why reraise as CalledProcessError?
                raise CalledProcessError(1, str(args), output=output_string)
Example No. 8
    def get_user_jobs(self, user_id: str) -> List[BatchJobMetadata]:
        with JobRegistry() as registry:
            return [
                self._parse_job_info(job_info)
                for job_info in registry.get_user_jobs(user_id)
            ]
Example No. 9
    def get_job_info(self, job_id: str, user_id: str) -> BatchJobMetadata:
        with JobRegistry() as registry:
            job_info = registry.get_job(job_id, user_id)
        return self._parse_job_info(job_info)
Example No. 10
    def setup_batch_jobs() -> None:
        with JobRegistry() as job_registry:
            job_registry.ensure_paths()
Example No. 11
from openeogeotrellis.job_registry import JobRegistry
import datetime
import pandas as pd

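# Ad-hoc cost report: aggregate CPU and memory usage per user and month for completed jobs.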
with JobRegistry() as registry:
    jobs_before = registry.get_all_jobs_before(datetime.datetime.now())
    df = pd.DataFrame(jobs_before)
    df.created = pd.to_datetime(df.created)
    df.index = df.created
    print(df.status.unique())
    df = df[(df.status == 'finished') | (df.status == 'error') |
            (df.status == 'canceled')]
    df = df[(df.user_id != 'jenkins')
            & (df.user_id != 'geopyspark-integrationtester')]

    df['yearmonth'] = df.index.strftime('%Y%m')
    df['cpuhour'] = df.cpu_time_seconds / 3600.0
    df['cost'] = df.cpuhour * 0.01
    df['memoryhour'] = df.memory_time_megabyte_seconds / (3600 * 1024)
    df['memorycost'] = df.memoryhour * 0.005
    df['totalcost'] = df.memorycost + df.cost

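    # Keep only user/month combinations whose cost exceeds 1.0.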
    cost_by_user_month = df.groupby(['user_id', 'yearmonth']).sum().cost
    cost_by_user_month = cost_by_user_month[cost_by_user_month > 1.0]
    memorycost_by_user_month = df.groupby(['user_id', 'yearmonth']).sum().memorycost
    memorycost_by_user_month = memorycost_by_user_month[memorycost_by_user_month > 1.0]

    total_cost = (memorycost_by_user_month + cost_by_user_month).round()