Example 1
    def __init__(self, args, environ):
        self.args = args

        # Fetch all required environment variables, exiting if unset.
        self.environ = sys_util.copy_from_env(
            ["CROMWELL", "CROMWELL_CONF", "JVM_OPTS"], environ)
        cromwell_conf = self.environ["CROMWELL_CONF"]
        cromwell_jar = self.environ["CROMWELL"]
        raw_jvm_flags = self.environ["JVM_OPTS"]
        jvm_flags = None
        if raw_jvm_flags:
            jvm_flags = raw_jvm_flags.split(" ")

        # Verify that the output directory is empty (or not there).
        if self.args.output_dir and not file_util.verify_gcs_dir_empty_or_missing(
                self.args.output_dir):
            sys_util.exit_with_error("Output directory not empty: %s" %
                                     self.args.output_dir)

        # Plug in the working directory and the project id to the Cromwell conf
        self.fill_cromwell_conf(cromwell_conf, self.args.working_dir,
                                self.args.project)

        # Set up the Cromwell driver
        self.driver = cromwell_driver.CromwellDriver(cromwell_conf,
                                                     cromwell_jar, jvm_flags)
        self.driver.start()
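Example 1 (and most of the examples below) leans on two sys_util helpers that this page never shows. Here is a minimal sketch of plausible implementations, reconstructed from how they are called; the bodies are assumptions, not the original module:

import logging
import sys


def exit_with_error(message):
    # Log the message and terminate with a non-zero exit status
    # (assumed behavior, inferred from the call sites above).
    logging.error(message)
    sys.exit(1)


def copy_from_env(keys, environ):
    # Copy the named variables out of environ, exiting if any is unset.
    # Empty values are allowed: Example 1 checks JVM_OPTS for emptiness.
    missing = [key for key in keys if key not in environ]
    if missing:
        exit_with_error("Missing environment variables: %s" % ", ".join(missing))
    return {key: environ[key] for key in keys}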
Example 2
import json

from googleapiclient import discovery
from googleapiclient.errors import HttpError
from oauth2client.client import GoogleCredentials


def verify_gcs_dir_empty_or_missing(path):
    """Verify that the output "directory" does not exist or is empty."""

    # Use the storage API directly instead of gsutil.
    # gsutil does not return explicit error codes and so to detect
    # a non-existent path would require capturing and parsing the error message.

    # Verify the input is a GCS path
    if not path.startswith('gs://'):
        sys_util.exit_with_error("Path is not a GCS path: '%s'" % path)

    # Tokenize the path into bucket and prefix
    parts = path[len('gs://'):].split('/', 1)
    bucket = parts[0]
    prefix = parts[1] if len(parts) > 1 else None

    # Get the storage endpoint
    credentials = GoogleCredentials.get_application_default()
    service = discovery.build('storage',
                              'v1',
                              credentials=credentials,
                              cache_discovery=False)

    # Build the request - only need the name
    fields = 'nextPageToken,items(name)'
    request = service.objects().list(bucket=bucket,
                                     prefix=prefix,
                                     fields=fields,
                                     maxResults=2)

    # If we get more than 1 item, we are done (directory not empty)
    # If we get zero items, we are done (directory empty)
    # If we get 1 item, then we need to check if it is a "directory object"

    items = []
    while request and len(items) < 2:
        try:
            response = request.execute()
        except HttpError as err:
            error = json.loads(err.content)
            error = error['error']

            sys_util.exit_with_error("%s %s: '%s'" %
                                     (error['code'], error['message'], path))

        items.extend(response.get('items', []))
        request = service.objects().list_next(request, response)

    if not items:
        return True

    # Guard against a bucket-root path, where prefix is None.
    if len(items) == 1 and prefix and items[0]['name'].rstrip('/') == prefix.rstrip('/'):
        return True

    return False
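A short usage sketch mirroring the call in Example 1; the bucket path is a placeholder:

# Hypothetical caller: refuse to launch when prior outputs are present.
output_dir = 'gs://my-bucket/pipeline-outputs'  # placeholder path
if not verify_gcs_dir_empty_or_missing(output_dir):
    sys_util.exit_with_error("Output directory not empty: %s" % output_dir)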
Example 3
def verify_gcs_dir_empty_or_missing(path):
  """Verify that the output "directory" does not exist or is empty."""

  # Use the storage API directly instead of gsutil.
  # gsutil does not return explicit error codes and so to detect
  # a non-existent path would require capturing and parsing the error message.

  # Verify the input is a GCS path
  if not path.startswith('gs://'):
    sys_util.exit_with_error("Path is not a GCS path: '%s'" % path)

  # Tokenize the path into bucket and prefix
  parts = path[len('gs://'):].split('/', 1)
  bucket = parts[0]
  prefix = parts[1] if len(parts) > 1 else None

  # Get the storage endpoint
  credentials = GoogleCredentials.get_application_default()
  service = discovery.build('storage', 'v1', credentials=credentials,
                            cache_discovery=False)

  # Build the request - only need the name
  fields = 'nextPageToken,items(name)'
  request = service.objects().list(
      bucket=bucket, prefix=prefix, fields=fields, maxResults=2)

  # If we get more than 1 item, we are done (directory not empty)
  # If we get zero items, we are done (directory empty)
  # If we get 1 item, then we need to check if it is a "directory object"

  items = []
  while request and len(items) < 2:
    try:
      response = request.execute()
    except HttpError as err:
      error = simplejson.loads(err.content)
      error = error['error']

      sys_util.exit_with_error(
          "%s %s: '%s'" % (error['code'], error['message'], path))

    items.extend(response.get('items', []))
    request = service.objects().list_next(request, response)

  if not items:
    return True

  # Guard against a bucket-root path, where prefix is None.
  if len(items) == 1 and prefix and items[0]['name'].rstrip('/') == prefix.rstrip('/'):
    return True

  return False
Example 4
import logging
import subprocess


def gsutil_cp(source_files, dest_dir):
    """Copies files to GCS and exits on error."""

    cp_cmd = ['gsutil', 'cp'] + source_files + [dest_dir]

    logging.info("Copying %s to %s", source_files, dest_dir)

    # Copies can fail, so include retries...
    for attempt in range(3):
        p = subprocess.Popen(cp_cmd, stderr=subprocess.PIPE)
        return_code = p.wait()
        if not return_code:
            return

        logging.warn("Copy %s to %s failed: attempt %d", source_files,
                     dest_dir, attempt)

    sys_util.exit_with_error("copying files from %s to %s failed: %s" %
                             (source_files, dest_dir, p.stderr.read()))
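A short usage sketch for gsutil_cp; the file names and destination are placeholders:

# Hypothetical caller: stage result files into the GCS output directory.
gsutil_cp(['results/outputs.json', 'results/metadata.json'],
          'gs://my-bucket/pipeline-outputs/')  # placeholder paths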
Example 5
def gsutil_cp(source_files, dest_dir):
  """Copies files to GCS and exits on error."""

  cp_cmd = ['gsutil', 'cp'] + source_files + [dest_dir]

  logging.info("Copying %s to %s", source_files, dest_dir)

  # Copies can fail, so include retries...
  for attempt in range(3):
    p = subprocess.Popen(cp_cmd, stderr=subprocess.PIPE)
    return_code = p.wait()
    if not return_code:
      return

    logging.warn("Copy %s to %s failed: attempt %d",
                 source_files, dest_dir, attempt)

  sys_util.exit_with_error(
      "copying files from %s to %s failed: %s" % (
          source_files, dest_dir, p.stderr.read()))
Example 6
  def __init__(self, args, environ):
    self.args = args

    # Fetch all required environment variables, exiting if unset.
    self.environ = sys_util.copy_from_env(
        ['CROMWELL', 'CROMWELL_CONF'], environ)
    cromwell_conf = self.environ['CROMWELL_CONF']
    cromwell_jar = self.environ['CROMWELL']

    # Verify that the output directory is empty (or not there).
    if not file_util.verify_gcs_dir_empty_or_missing(self.args.output_dir):
      sys_util.exit_with_error(
          "Output directory not empty: %s" % self.args.output_dir)

    # Plug in the working directory and the project id to the Cromwell conf
    self.fill_cromwell_conf(cromwell_conf,
                            self.args.working_dir, self.args.project)

    # Set up the Cromwell driver
    self.driver = cromwell_driver.CromwellDriver(cromwell_conf, cromwell_jar)
    self.driver.start()
Example 7
    def __init__(self, args, environ):
        self.args = args

        # Fetch all required environment variables, exiting if unset.
        self.environ = sys_util.copy_from_env(['CROMWELL', 'CROMWELL_CONF'],
                                              environ)
        cromwell_conf = self.environ['CROMWELL_CONF']
        cromwell_jar = self.environ['CROMWELL']

        # Verify that the output directory is empty (or not there).
        if not file_util.verify_gcs_dir_empty_or_missing(self.args.output_dir):
            sys_util.exit_with_error("Output directory not empty: %s" %
                                     self.args.output_dir)

        # Plug in the working directory and the project id to the Cromwell conf
        self.fill_cromwell_conf(cromwell_conf, self.args.working_dir,
                                self.args.project)

        # Set up the Cromwell driver
        self.driver = cromwell_driver.CromwellDriver(cromwell_conf,
                                                     cromwell_jar)
        self.driver.start()
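Examples 1, 6, and 7 all call self.fill_cromwell_conf(...) to plug the working directory and project id into the Cromwell conf. The method is not shown on this page; here is a minimal sketch under the assumption that the conf file is a template with named placeholders (the placeholder names are invented for illustration):

    def fill_cromwell_conf(self, cromwell_conf, working_dir, project):
        # Assumed implementation: treat the conf file as a template with
        # {project} and {working_dir} placeholders and rewrite it in place.
        with open(cromwell_conf, 'r') as f:
            template = f.read()
        with open(cromwell_conf, 'w') as f:
            f.write(template.format(project=project,
                                    working_dir=working_dir))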
Example 8
    def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
        """Post new job to the server and poll for completion."""

        # Add required input files
        with open(wdl, 'r') as f:
            wdl_source = f.read()
        with open(workflow_inputs, 'r') as f:
            wf_inputs = f.read()

        files = {
            'wdlSource': wdl_source,
            'workflowInputs': wf_inputs,
        }

        # Add workflow options if specified
        if workflow_options:
            with open(workflow_options, 'r') as f:
                wf_options = f.read()
                files['workflowOptions'] = wf_options

        # After Cromwell starts, it may take a few seconds to be ready for requests.
        # Poll up to a minute for successful connect and submit.

        job = None
        max_time_wait = 60
        wait_interval = 5

        # Share one HTTP session across the submit attempts and status polls.
        session = requests.Session()

        time.sleep(wait_interval)
        for attempt in range(max_time_wait // wait_interval):
            try:
                job = self.fetch(session, post=True, files=files)
                break
            except requests.exceptions.ConnectionError as e:
                logging.info("Failed to connect to Cromwell (attempt %d): %s",
                             attempt + 1, e)
                time.sleep(wait_interval)

        if not job:
            sys_util.exit_with_error(
                "Failed to connect to Cromwell after {0} seconds".format(
                    max_time_wait))

        if job['status'] != 'Submitted':
            sys_util.exit_with_error(
                "Job status from Cromwell was not 'Submitted', instead '{0}'".
                format(job['status']))

        # Job is running.
        cromwell_id = job['id']
        logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

        # Poll Cromwell for job completion.
        attempt = 0
        max_failed_attempts = 3
        while True:
            time.sleep(sleep_time)

            # Cromwell occasionally fails to respond to the status request.
            # Only give up after 3 consecutive failed requests.
            try:
                status_json = self.fetch(session,
                                         wf_id=cromwell_id,
                                         method='status')
                attempt = 0
            except requests.exceptions.ConnectionError as e:
                attempt += 1
                logging.info(
                    "Error polling Cromwell job status (attempt %d): %s",
                    attempt, e)

                if attempt >= max_failed_attempts:
                    sys_util.exit_with_error(
                        "Cromwell did not respond for %d consecutive requests"
                        % attempt)

                continue

            status = status_json['status']
            if status == 'Succeeded':
                break
            elif status == 'Submitted':
                pass
            elif status == 'Running':
                pass
            else:
                sys_util.exit_with_error(
                    "Status of job is not Submitted, Running, or Succeeded: %s"
                    % status)

        logging.info("Cromwell job status: %s", status)

        # Cromwell produces a list of outputs and full job details
        outputs = self.fetch(session, wf_id=cromwell_id, method='outputs')
        metadata = self.fetch(session, wf_id=cromwell_id, method='metadata')

        return outputs, metadata
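Every submit and batch variant on this page calls a self.fetch helper that is never shown. Below is a minimal sketch reconstructed from the call sites, assuming a local Cromwell server on port 8000 (the port appears in Example 9's batch URL) and Cromwell's standard /api/workflows/v1 endpoints; the body is an assumption, not the original helper:

    def fetch(self, session=None, wf_id=None, post=False, files=None,
              method=None):
        # Assumed helper: build the Cromwell REST URL from the workflow id
        # and method ('status', 'outputs', 'metadata'), then GET or POST.
        url = 'http://localhost:8000/api/workflows/v1'
        if wf_id:
            url = '%s/%s' % (url, wf_id)
        if method:
            url = '%s/%s' % (url, method)
        http = session if session is not None else requests
        if post:
            response = http.post(url, files=files)
        else:
            response = http.get(url)
        return response.json()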
Example 9
    def batch(self, submission_id, wdl, inputs, options, batch_limit,
              query_limit):
        logging.info(
            "Starting batch request. Waiting for Cromwell to start...")
        self.logger.log(
            "Beginning batch request",
            batch_limit=batch_limit,
            query_limit=query_limit,
        )
        time.sleep(60)
        with open(wdl, 'r') as wdlReader:
            with open(options, 'r') as optionReader:
                opts = json.load(optionReader)
                opts['google_labels'] = {
                    'lapdog-submission-id': 'id-' + submission_id,
                    'lapdog-execution-role': 'worker'
                }
                data = {
                    'workflowSource': wdlReader.read(),
                    # 'workflowInputs': json.dumps([line for line in reader]),
                    'workflowOptions': json.dumps(opts),
                }
        logging.info("Starting the following configuration: " +
                     json.dumps(data))
        output = []
        first = True
        with open(inputs, 'r') as inputReader:
            reader = csv.DictReader(inputReader,
                                    delimiter='\t',
                                    lineterminator='\n')
            with requests.Session() as session:
                for batch in clump(reader, batch_limit):
                    self.check_cromwell()
                    logging.info("Running a new batch of %d workflows" %
                                 batch_limit)

                    chunk = []

                    if not first:
                        logging.info("Restarting cromwell...")
                        self.cromwell_proc.kill()
                        self.cromwell_proc = None
                        time.sleep(10)
                        self.start(self.mem)
                        time.sleep(20)
                        logging.info("Resuming next batch")
                    else:
                        first = False

                    for group in clump(batch, query_limit):
                        logging.info("Starting a chunk of %d workflows" %
                                     query_limit)
                        group = [line for line in group]
                        logging.info("There are %d workflows in this group" %
                                     len(group))
                        response = None
                        for attempt in range(10):
                            try:
                                data['workflowInputs'] = json.dumps(
                                    [unpack(line) for line in group])
                                self.logger.log('Launching workflow batch',
                                                json=data)
                                response = session.post(
                                    'http://localhost:8000/api/workflows/v1/batch',
                                    files=data)
                                response = response.json()
                                logging.info("Submitted jobs. Begin polling")
                                break
                            except requests.exceptions.ConnectionError as e:
                                self.logger.log_exception()
                                traceback.print_exc()
                                self.check_cromwell()
                                logging.info(
                                    "Failed to connect to Cromwell (attempt %d): %s",
                                    attempt + 1, e)
                                time.sleep(30)
                            except ValueError:
                                self.logger.log_exception(
                                    "JSON Decode error",
                                    response=response.text
                                    if response is not None else None,
                                )
                                traceback.print_exc()
                                self.check_cromwell()
                                logging.error(
                                    "Unexpected response from Cromwell: (%d) : %s"
                                    % (response.status_code, response.text))
                                raise

                        if not response:
                            self.check_cromwell()
                            self.logger.log("Cromwell timeout",
                                            severity="WARNING")
                            sys_util.exit_with_error(
                                "Failed to connect to Cromwell after {0} seconds"
                                .format(300))

                        logging.info("Raw response: " + repr(response))

                        for job in response:
                            if job['status'] not in ('Submitted', 'Running'):
                                # Abort everything via a separate loop
                                # variable so the failing job stays bound
                                # for the error message below.
                                for other in response:
                                    self.abort(other['id'])
                                self.logger.log('Unexpected job status',
                                                status=job['status'],
                                                jobs=response,
                                                severity='ERROR')
                                sys_util.exit_with_error(
                                    "Job {} status from Cromwell was not 'Submitted', instead '{}'"
                                    .format(job['id'], job['status']))
                            else:
                                chunk.append(job)

                    self.batch_submission = True
                    self.check_cromwell()

                    @atexit.register
                    def abort_all_jobs(jobs=chunk):
                        # Bind this batch's jobs as a default argument so the
                        # handler does not close over a later batch's list.
                        if self.batch_submission:
                            for job in jobs:
                                self.abort(job['id'])

                    for i in range(12):
                        time.sleep(5)

                    attempt = 0
                    max_failed_attempts = 3
                    known_failures = set()
                    while True:
                        for i in range(3):
                            time.sleep(10)

                        self.check_cromwell()

                        # Cromwell occasionally fails to respond to the status request.
                        # Only give up after 3 consecutive failed requests.
                        try:
                            status_json = []
                            for job in chunk:
                                status_json.append(
                                    self.fetch(session,
                                               wf_id=job['id'],
                                               method='status'))
                                # Brief pause so the status polls do not
                                # hammer the local Cromwell server.
                                time.sleep(0.1)
                            attempt = 0
                        except requests.exceptions.ConnectionError as e:
                            self.logger.log_exception()
                            attempt += 1
                            logging.info(
                                "Error polling Cromwell job status (attempt %d): %s",
                                attempt, e)
                            self.check_cromwell()

                            if attempt >= max_failed_attempts:
                                self.logger.log(
                                    'Cromwell crash with active workflows',
                                    jobs=chunk,
                                    severity='WARNING')
                                sys_util.exit_with_error(
                                    "Cromwell did not respond for %d consecutive requests"
                                    % attempt)

                            continue

                        statuses = {job['status'] for job in status_json}
                        # logging.info("<WORKFLOW STATUS UPDATE> %s" % json.dumps(status_json))
                        if 'Failed' in statuses:
                            new_failures = [
                                job for job in status_json
                                if job['status'] == 'Failed'
                                and job['id'] not in known_failures
                            ]
                            if new_failures:
                                sys.stderr.write(
                                    "The following jobs failed: %s\n" %
                                    (', '.join('%s (%s)' %
                                               (job['id'], job['status'])
                                               for job in new_failures)))
                            known_failures |= {
                                job['id']
                                for job in new_failures
                            }
                        if not statuses - {'Succeeded', 'Failed', 'Aborted'}:
                            logging.info("All workflows in terminal states")
                            self.logger.log(
                                'Batch complete',
                                json=status_json,
                            )
                            break

                    self.batch_submission = False

                    output += [{
                        'workflow_id': job['id'],
                        'workflow_status': job['status'],
                        'workflow_output': (
                            self.fetch(session, wf_id=job['id'],
                                       method='outputs')
                            if job['status'] == 'Succeeded' else None),
                        'workflow_metadata': (
                            self.fetch(session, wf_id=job['id'],
                                       method='metadata')
                            if job['status'] == 'Succeeded' else None),
                    } for job in status_json]

                    self.check_cromwell()

                    if 'Aborted' in statuses:
                        # Quit now. No reason to start a new batch to get aborted
                        self.logger.log('Submission aborted', json=output)
                        sys.stderr.write(
                            "There were aborted workflows. "
                            "Aborting submission now.\n")
                        return output
        logging.info("<SUBMISSION COMPLETE. FINALIZING DATA>")
        self.logger.log('Submission complete. Finalizing data', json=output)
        return output
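Example 9 splits its input with clump(reader, batch_limit) and clump(batch, query_limit), another helper not shown here. A minimal sketch of a chunking generator with the behavior the loops rely on; the implementation is an assumption:

from itertools import islice


def clump(iterable, size):
    # Yield successive lists of at most `size` items from iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk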
Example 10
  def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
    """Post new job to the server and poll for completion."""

    # Add required input files
    with open(wdl, 'rb') as f:
      wdl_source = f.read()
    with open(workflow_inputs, 'rb') as f:
      wf_inputs = f.read()

    files = {
        'wdlSource': wdl_source,
        'workflowInputs': wf_inputs,
    }

    # Add workflow options if specified
    if workflow_options:
      with open(workflow_options, 'rb') as f:
        wf_options = f.read()
        files['workflowOptions'] = wf_options

    # After Cromwell starts, it may take a few seconds to be ready for requests.
    # Poll up to a minute for successful connect and submit.

    job = None
    max_time_wait = 60
    wait_interval = 5

    time.sleep(wait_interval)
    for attempt in range(max_time_wait // wait_interval):
      try:
        job = self.fetch(post=True, files=files)
        break
      except requests.exceptions.ConnectionError as e:
        logging.info("Failed to connect to Cromwell (attempt %d): %s",
          attempt + 1, e)
        time.sleep(wait_interval)

    if not job:
      sys_util.exit_with_error(
          "Failed to connect to Cromwell after {0} seconds".format(
              max_time_wait))

    if job['status'] != 'Submitted':
      sys_util.exit_with_error(
          "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
              job['status']))

    # Job is running.
    cromwell_id = job['id']
    logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

    # Poll Cromwell for job completion.
    attempt = 0
    max_failed_attempts = 3
    while True:
      time.sleep(sleep_time)

      # Cromwell occasionally fails to respond to the status request.
      # Only give up after 3 consecutive failed requests.
      try:
        status_json = self.fetch(wf_id=cromwell_id, method='status')
        attempt = 0
      except requests.exceptions.ConnectionError as e:
        attempt += 1
        logging.info("Error polling Cromwell job status (attempt %d): %s",
          attempt, e)

        if attempt >= max_failed_attempts:
          sys_util.exit_with_error(
            "Cromwell did not respond for %d consecutive requests" % attempt)

        continue

      status = status_json['status']
      if status == 'Succeeded':
        break
      elif status == 'Submitted':
        pass
      elif status == 'Running':
        pass
      else:
        sys_util.exit_with_error(
            "Status of job is not Submitted, Running, or Succeeded: %s" % status)

    logging.info("Cromwell job status: %s", status)

    # Cromwell produces a list of outputs and full job details
    outputs = self.fetch(wf_id=cromwell_id, method='outputs')
    metadata = self.fetch(wf_id=cromwell_id, method='metadata')

    return outputs, metadata
Example 11
  def submit(self, wdl, workflow_inputs, workflow_options, sleep_time=15):
    """Post new job to the server and poll for completion."""

    # Add required input files
    with open(wdl, 'rb') as f:
      wdl_source = f.read()
    with open(workflow_inputs, 'rb') as f:
      wf_inputs = f.read()

    files = {
        'wdlSource': wdl_source,
        'workflowInputs': wf_inputs,
    }

    # Add workflow options if specified
    if workflow_options:
      with open(workflow_options, 'rb') as f:
        wf_options = f.read()
        files['workflowOptions'] = wf_options

    # After Cromwell starts, it may take a few seconds to be ready for requests.
    # Try up to a minute to connect.
    job = None
    max_time_wait = 60
    wait_interval = 5
    for attempt in range(max_time_wait // wait_interval):
      try:
        job = self.fetch(post=True, files=files)
        break
      except requests.exceptions.ConnectionError as e:
        logging.info("Failed to connect to Cromwell(%d): %s", attempt, e)
        time.sleep(wait_interval)

    if not job:
      sys_util.exit_with_error(
          "Failed to connect to Cromwell after {0} seconds".format(
              max_time_wait))

    if job['status'] != 'Submitted':
      sys_util.exit_with_error(
          "Job status from Cromwell was not 'Submitted', instead '{0}'".format(
              job['status']))

    # Job is running.
    cromwell_id = job['id']
    logging.info("Cromwell job id: %s", cromwell_id)

    # Poll for completion.
    while True:
      time.sleep(sleep_time)
      status_json = self.fetch(wf_id=cromwell_id, method='status')

      status = status_json['status']
      if status == 'Succeeded':
        break
      elif status == 'Running':
        pass
      else:
        sys_util.exit_with_error(
            'Status of job is not Running or Succeeded: %s' % status)

    logging.info("Succeeded")

    # Cromwell produces a list of outputs and full job details
    outputs = self.fetch(wf_id=cromwell_id, method='outputs')
    metadata = self.fetch(wf_id=cromwell_id, method='metadata')

    return outputs, metadata
Example 12
    def submit(
        self,
        wdl,
        workflow_inputs,
        workflow_options,
        workflow_dependencies,
        sleep_time=15,
    ):
        """Post new job to the server and poll for completion."""

        # Add required input files
        with open(wdl, "rb") as f:
            wf_source = f.read()
        with open(workflow_inputs, "rb") as f:
            wf_inputs = f.read()

        files = {
            "workflowSource": wf_source,
            "workflowInputs": wf_inputs,
        }

        if workflow_dependencies:
            with open(workflow_dependencies, "rb") as f:
                # Read as Base64 byte string
                wf_dependencies = f.read()
                # Convert to binary zip file
                files["workflowDependencies"] = base64.decodebytes(
                    wf_dependencies)

        # Add workflow options if specified
        if workflow_options:
            with open(workflow_options, "rb") as f:
                wf_options = f.read()
                files["workflowOptions"] = wf_options

        # After Cromwell starts, it may take a few seconds to be ready for requests.
        # Poll up to a minute for successful connect and submit.

        job = None
        max_time_wait = 60
        wait_interval = 5

        time.sleep(wait_interval)
        for attempt in range(max_time_wait // wait_interval):
            try:
                job = self.fetch(post=True, files=files)
                break
            except requests.exceptions.ConnectionError as e:
                logging.info("Failed to connect to Cromwell (attempt %d): %s",
                             attempt + 1, e)
                time.sleep(wait_interval)

        if not job:
            sys_util.exit_with_error(
                "Failed to connect to Cromwell after {0} seconds".format(
                    max_time_wait))

        if job["status"] != "Submitted":
            sys_util.exit_with_error(
                "Job status from Cromwell was not 'Submitted', instead '{0}'".
                format(job["status"]))

        # Job is running.
        cromwell_id = job["id"]
        logging.info("Job submitted to Cromwell. job id: %s", cromwell_id)

        # Poll Cromwell for job completion.
        attempt = 0
        max_failed_attempts = 3
        while True:
            time.sleep(sleep_time)

            # Cromwell occasionally fails to respond to the status request.
            # Only give up after 3 consecutive failed requests.
            try:
                status_json = self.fetch(wf_id=cromwell_id, method="status")
                attempt = 0
            except requests.exceptions.ConnectionError as e:
                attempt += 1
                logging.info(
                    "Error polling Cromwell job status (attempt %d): %s",
                    attempt, e)

                if attempt >= max_failed_attempts:
                    sys_util.exit_with_error(
                        "Cromwell did not respond for %d consecutive requests"
                        % attempt)

                continue

            status = status_json["status"]
            if status == "Succeeded":
                break
            elif status == "Submitted":
                pass
            elif status == "Running":
                pass
            else:
                sys_util.exit_with_error(
                    "Status of job is not Submitted, Running, or Succeeded: %s"
                    % status)

        logging.info("Cromwell job status: %s", status)

        # Cromwell produces a list of outputs and full job details
        outputs = self.fetch(wf_id=cromwell_id, method="outputs")
        metadata = self.fetch(wf_id=cromwell_id, method="metadata")

        return outputs, metadata
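Example 12 expects workflow_dependencies to be a Base64-encoded zip, which it decodes with base64.decodebytes before handing Cromwell the binary archive. A sketch of the matching encoding step on the caller's side; the file names are placeholders:

import base64

# Hypothetical preparation: store the dependencies zip Base64-encoded so
# that submit() can decode it back to a binary zip file.
with open('deps.zip', 'rb') as f:            # placeholder input
    encoded = base64.encodebytes(f.read())
with open('deps.zip.b64', 'wb') as f:        # passed as workflow_dependencies
    f.write(encoded)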