Example #1
def main():

  run_cr_response = arvados.api().container_requests().list(filters=[["state", "=", "Committed"],
                                                                     ["requesting_container_uuid", "=", None],
                                                                     ["priority", ">", "0"]]).execute()


  fin_cr_response = arvados.api().container_requests().list(filters=[["state", "=", "Final"],
                                                                     ["requesting_container_uuid", "=", None],
                                                                     ["priority", ">=", "0"]], limit=100).execute()

  try:
    run_pi_response = arvados.api().pipeline_instances().list(filters=[["state", "=", "RunningOnServer"]]).execute()
    fin_pi_response = arvados.api().pipeline_instances().list(filters=[["state", "!=", "RunningOnServer"]], limit=10).execute()
    print("Currently running Workflows")
    print("UUID, NAME, CREATED AT, OWNER PROJECT, STATUS")
    for item in run_cr_response['items']:
      print(print_status(item))
    for item in run_pi_response['items']:
      print(print_status(item))
    print("")
  except Exception:
    # Skip this section if anything goes wrong (e.g. the pipeline_instances API is unavailable).
    pass

  print("Recently finished Workflows")
  print("UUID, NAME, FINISHED AT, OWNER PROJECT, STATUS")
  for item in fin_cr_response['items']:
    print(print_status(item))
  for item in fin_pi_response['items']:
    print(print_status(item))
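Both loops rely on a print_status helper that is not included in this example. A minimal sketch, assuming each container request or pipeline instance record carries the usual uuid, name, owner_uuid, state and timestamp fields, could look like this:

def print_status(item):
  # Sketch of the assumed helper: format one record as a comma-separated line
  # matching the headers printed above.
  when = item.get('modified_at') or item.get('created_at')
  return "%s, %s, %s, %s, %s" % (item['uuid'], item.get('name', ''), when,
                                 item['owner_uuid'], item['state'])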
Example #2
def one_task_per_gvcf_group_in_stream(stream_name, gvcf_by_group, gvcf_indices, interval_list_by_group, if_sequence, ref_input_pdh, create_task_func=create_task):
    """
    Process one stream of data and launch a subtask for handling it
    """
    print "Finalising stream %s" % stream_name
    for group_name in sorted(gvcf_by_group.keys()):
        print "Have %s gVCFs in group %s" % (len(gvcf_by_group[group_name]), group_name)
        # require interval_list for this group
        if group_name not in interval_list_by_group:
            raise errors.InvalidArgumentError("Inputs collection did not contain interval_list for group %s" % group_name)
        interval_lists = interval_list_by_group[group_name].keys()
        if len(interval_lists) > 1:
            raise errors.InvalidArgumentError("Inputs collection contained more than one interval_list for group %s: %s" % (group_name, ' '.join(interval_lists)))
        interval_list_manifest = interval_list_by_group[group_name].get(interval_lists[0]).as_manifest()
        # Create a portable data hash for the task's interval_list
        try:
            r = arvados.api().collections().create(body={"manifest_text": interval_list_manifest}).execute()
            interval_list_pdh = r["portable_data_hash"]
        except:
            raise

        task_inputs_manifest = ""
        for ((s_name, gvcf_name), gvcf_f) in gvcf_by_group[group_name].items():
            task_inputs_manifest += gvcf_f.as_manifest()
            gvcf_index_f = gvcf_indices.get((s_name, re.sub(r'vcf.gz$', 'vcf.tbi', gvcf_name)),
                                            gvcf_indices.get((s_name, re.sub(r'vcf.gz$', 'vcf.gz.tbi', gvcf_name)),
                                                             None))
            if gvcf_index_f:
                task_inputs_manifest += gvcf_index_f.as_manifest()
            else:
                # no index for gVCF - TODO: should this be an error or warning?
                print "WARNING: No correponding .tbi index file found for gVCF file %s" % gvcf_name
                #raise errors.InvalidArgumentError("No correponding .tbi index file found for gVCF file %s" % gvcf_name)

        # Create a portable data hash for the task's subcollection
        try:
            r = arvados.api().collections().create(body={"manifest_text": task_inputs_manifest}).execute()
            task_inputs_pdh = r["portable_data_hash"]
        except:
            raise

        # Create task to process this group
        name_components = []
        if len(stream_name) > 0 and stream_name != ".":
            name_components.append(stream_name)
        if len(group_name) > 0:
            name_components.append(group_name)
        if len(name_components) == 0:
            name = "all"
        else:
            name = '::'.join(name_components)

        print "Creating task to process %s" % name
        new_task_params = {
                    'inputs': task_inputs_pdh,
                    'ref': ref_input_pdh,
                    'interval_list': interval_list_pdh,
                    'name': name
                    }
        task = create_task_func(if_sequence + 1, new_task_params)
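The default create_task helper named in the signature is not shown here. Based on the job_tasks pattern used in the other examples in this collection, a minimal sketch (an assumption, not the original helper) might be:

def create_task(sequence, params):
    # Assumed helper: queue a new job task one sequence step later, carrying
    # the prepared parameters.
    new_task_attrs = {
        'job_uuid': arvados.current_job()['uuid'],
        'created_by_job_task_uuid': arvados.current_task()['uuid'],
        'sequence': sequence,
        'parameters': params,
    }
    return arvados.api().job_tasks().create(body=new_task_attrs).execute()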
Example #3
    def _test_websocket_reconnect(self, close_unexpected):
        run_test_server.authorize_with('active')
        events = Queue.Queue(100)

        logstream = io.BytesIO()
        rootLogger = logging.getLogger()
        streamHandler = logging.StreamHandler(logstream)
        rootLogger.addHandler(streamHandler)

        filters = [['object_uuid', 'is_a', 'arvados#human']]
        filters.append(['created_at', '>=', self.localiso(self.TIME_PAST)])
        self.ws = arvados.events.subscribe(
            arvados.api('v1'), filters,
            events.put_nowait,
            poll_fallback=False,
            last_log_id=None)
        self.assertIsInstance(self.ws, arvados.events.EventClient)
        self.assertEqual(200, events.get(True, 5)['status'])

        # create obj
        human = arvados.api('v1').humans().create(body={}).execute()

        # expect an event
        self.assertIn(human['uuid'], events.get(True, 5)['object_uuid'])
        with self.assertRaises(Queue.Empty):
            self.assertEqual(events.get(True, 2), None)

        # close (im)properly
        if close_unexpected:
            self.ws.ec.close_connection()
        else:
            self.ws.close()

        # create one more obj
        human2 = arvados.api('v1').humans().create(body={}).execute()

        # (un)expect the object creation event
        if close_unexpected:
            log_object_uuids = []
            for i in range(0, 2):
                event = events.get(True, 5)
                if event.get('object_uuid') != None:
                    log_object_uuids.append(event['object_uuid'])
            with self.assertRaises(Queue.Empty):
                self.assertEqual(events.get(True, 2), None)
            self.assertNotIn(human['uuid'], log_object_uuids)
            self.assertIn(human2['uuid'], log_object_uuids)
        else:
            with self.assertRaises(Queue.Empty):
                self.assertEqual(events.get(True, 2), None)

        # verify log messages to confirm whether an (un)expected close occurred
        log_messages = logstream.getvalue()
        closeLogFound = log_messages.find("Unexpected close. Reconnecting.")
        retryLogFound = log_messages.find("Error during websocket reconnect. Will retry")
        if close_unexpected:
            self.assertNotEqual(closeLogFound, -1)
        else:
            self.assertEqual(closeLogFound, -1)
        rootLogger.removeHandler(streamHandler)
Example #4
def spawn_new_task_per_file(script_parameter, regex, if_sequence=0, and_end_task=True):
    """
    Generalized form of one_task_per_pair_input_file from
    https://github.com/curoverse/arvados/blob/master/crunch_scripts/arvados_bwa.py

    Creates a new task if the file in the collection matches the regex
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters'][script_parameter]
    input_collection = Collection(job_input)
    for name in input_collection:
        if not re.search(regex,name):
            continue
        new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': name,
                        }
                    }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
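For context, a crunch script might drive this helper roughly as follows; the 'input' parameter name and the regex are illustrative assumptions, not part of the original example:

# At sequence 0 this call queues one task per matching file and marks the
# current task done; at sequence 1 it returns immediately.
spawn_new_task_per_file('input', r'\.vcf\.gz$', if_sequence=0, and_end_task=True)
if arvados.current_task()['sequence'] == 1:
    this_file = arvados.current_task()['parameters']['input_1']
    # ... process this_file ...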
Example #5
def check_fail(container_request_uuid):
  container_uuid = arvados.api().container_requests().list(filters=[["uuid","=", container_request_uuid]]).execute()['items'][0]['container_uuid']
  exit_code = arvados.api().containers().list(filters=[["uuid","=", container_uuid]]).execute()['items'][0]['exit_code']
  if exit_code == 0:
    return 'Complete'
  else:
    return 'Failed'
Example #6
def check_project_exists(project_uuid):
    try:
        arvados.api('v1').groups().get(uuid=project_uuid).execute()
    except (apiclient.errors.Error, arvados.errors.NotFoundError) as error:
        raise ValueError("Project {} not found ({})".format(project_uuid,
                                                            error))
    else:
        return True
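A quick usage sketch; the project UUID below is a placeholder, not a real identifier:

# Validate a destination project before writing anything into it.
destination_project = 'zzzzz-j7d0g-0123456789abcde'
if check_project_exists(destination_project):
    print("Project {} is accessible".format(destination_project))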
Example #7
def pipeline_instance(uuid, job_patterns):
  resp = arvados.api().pipeline_instances().list(filters=[["uuid","=", uuid]]).execute()
  for job in resp['items'][0]['components']['cwl-runner']['job']['components']:
    for pattern in job_patterns:
      if re.match(pattern, job):
        uuid = resp['items'][0]['components']['cwl-runner']['job']['components'][job]
        jobresp = arvados.api().jobs().list(filters=[["uuid", "=", uuid]]).execute()
        print job
        print '\n'.join(jobresp['items'][0]['script_parameters']['tasks'][0]['command'])
Example #8
def list_sharing(owner_uuid):
  # List who has write access to the group owner_uuid
  sharing = []
  call = arvados.api().groups().get(uuid=owner_uuid).execute()
  writable_uuids = call['writable_by']
  for uuid in writable_uuids:
    user = arvados.api().users().get(uuid=uuid).execute()
    sharing.append("Writable by %s %s" % (user['full_name'], uuid))
  return sharing
Example #9
    def _test_subscribe(self, poll_fallback, expect_type):
        run_test_server.authorize_with('active')
        events = Queue.Queue(3)
        self.ws = arvados.events.subscribe(
            arvados.api('v1'), [['object_uuid', 'is_a', 'arvados#human']],
            events.put, poll_fallback=poll_fallback)
        self.assertIsInstance(self.ws, expect_type)
        self.assertEqual(200, events.get(True, 10)['status'])
        human = arvados.api('v1').humans().create(body={}).execute()
        self.assertEqual(human['uuid'], events.get(True, 10)['object_uuid'])
        self.assertTrue(events.empty(), "got more events than expected")
Example #10
def container_request(name):
  filter = "%" + name + "%"
  completes = ""
  cr = arvados.api().container_requests().list(filters=[["name","like",filter]], limit=500).execute()
  total = cr['items_available']
  if total != 0:
    print "Container request uuid, Container request name, Container request finish time"
  for num in xrange(0,total):
    c = cr['items'][num]['container_uuid']
    c_resp = arvados.api().containers().list(filters=[["uuid","=",c]]).execute()
    if c_resp['items'][0]['state'] == 'Complete': # api returns in order of time by default so this should get the latest complete instance.
      completes = cr['items'][num]['uuid']
      print "%s %s %s" % (completes, cr['items'][num]['name'], cr['items'][num]['modified_at'])
Example #11
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', '\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file != None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1':left_file.as_manifest(),
                        'input_2':right_file.as_manifest()
                        }
                    }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                   body={'success':True}
                                   ).execute()
        exit(0)
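A crunch script would normally call this helper near the top, in the same way arvados.job_setup.one_task_per_input_file is used; the sketch below is illustrative:

# Sequence 0: queue one task per fastq pair, then exit via and_end_task.
one_task_per_pair_input_file(if_sequence=0, and_end_task=True)
# Sequence 1: each queued task receives its pair as two single-file manifests.
params = arvados.current_task()['parameters']
input_1_manifest = params['input_1']
input_2_manifest = params['input_2']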
Example #12
def main():

    SCOPES = [
      'https://mail.google.com/',
      'https://www.googleapis.com/auth/gmail.send',
      'https://www.googleapis.com/auth/gmail.compose'
    ]
    arv = arvados.api('v1')

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--from', dest='from_email', required=True, help="The sender email address (required)")
    parser.add_argument(
        '-t', '--to', dest='to_email', required=True, help="The destination email address (required)")
    parser.add_argument(
        '-c', '--client', dest='client_secret', required=True, help="The path to your client_secret.json (required)")
    parser.add_argument(
        '-s', '--storage', dest='storage', required=True, help="The path to your stored credentials (required)")
    parser.add_argument(
        '-l', '--location', dest='location', required=True, help="The location of the cluster (required)")
    options = parser.parse_args()
    CLIENT_SECRET = options.client_secret

    # All current RunningOnServer
    num_running = arvados.api('v1').pipeline_instances().list(
                      filters=[["state","=","RunningOnServer"]]).execute()["items_available"]
    message = 'There are currently %s pipelines running on %s. \n\n' % (str(num_running), options.location)

    for instance_num in range(0,num_running):
        instance = arvados.api('v1').pipeline_instances().list(
                       filters=[["state","=","RunningOnServer"]]).execute()["items"][instance_num]
        for component, value in instance["components"].iteritems():
            if "job" in value:
                if value["job"]["state"] == 'Running':
		    message += '%s\n%s started at: %s\n' % (instance["uuid"], component, RFC3339Convert_to_readable(value["job"]["started_at"]))
		    message += '%s has been running for %s\n' %(component, Time_diff(RFC3339Convert_to_dt(value["job"]["started_at"]),Current_time()))
		if value["job"]["state"] == 'Queued':
		    message += '%s\n%s is queued, it was created at: %s\n' % (instance["uuid"], component, RFC3339Convert_to_readable(value["job"]["created_at"]))
        message += '\n'

    store = file.Storage(options.storage)
    credz = store.get()
    if not credz or credz.invalid:
        flags = tools.argparser.parse_args(args=[])
        flow = client.flow_from_clientsecrets(CLIENT_SECRET, SCOPES)
        credz = tools.run_flow(flow, store, flags)
    GMAIL = build('gmail', 'v1', http=credz.authorize(Http()))

    message = CreateMessage(options.from_email, options.to_email, '%s pipelines running on %s' % (str(num_running), options.location), message)
    SendMessage(GMAIL, 'me', message)
Example #13
def check_fail(container_request_uuid):
  container_uuid = arvados.api().container_requests().list(filters=[["uuid","=", container_request_uuid]]).execute()['items'][0]['container_uuid']
  exit_code = arvados.api().containers().list(filters=[["uuid","=", container_uuid]]).execute()['items'][0]['exit_code']
  state = arvados.api().containers().list(filters=[["uuid","=", container_uuid]]).execute()['items'][0]['state']
  if exit_code == 0:
    return 'Complete'
  elif not exit_code:
    if state == "Running":
      return 'Running'
    elif state == "Queued":
      return 'Queued/Cancelled'
    else:
      return 'Cancelled'
  else:
    return 'Failed'
Example #14
def main(args, stdout, stderr, api_client=None):
    args = parse_args(args)

    if api_client is None:
        api_client = arvados.api('v1')

    try:
        cr = arvados.CollectionReader(args.locator, api_client=api_client,
                                      num_retries=args.retries)
        cr.normalize()
    except (arvados.errors.ArgumentError,
            arvados.errors.NotFoundError) as error:
        print("arv-ls: error fetching collection: {}".format(error),
              file=stderr)
        return 1

    formatters = []
    if args.s:
        formatters.append(size_formatter)
    formatters.append(name_formatter)

    for f in cr.all_files():
        print(*(info_func(f) for info_func in formatters), file=stdout)

    return 0
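This is essentially the arv-ls entry point, so it can also be driven directly; the collection locator below is a placeholder, and -s mirrors the args.s size option checked above:

import sys

# List one collection's files (with sizes) on the usual streams.
exit_code = main(['-s', 'zzzzz-4zz18-0123456789abcde'], sys.stdout, sys.stderr)
sys.exit(exit_code)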
Example #15
    def _get_all_pages(self):
        got = 0
        last_id = 0
        filters = [
            ['object_uuid', '=', self.job_uuid],
            ['event_type', '=', 'stderr']]
        try:
            while True:
                page = arvados.api().logs().index(
                    limit=1000,
                    order=['id asc'],
                    filters=filters + [['id','>',str(last_id)]],
                    select=['id', 'properties'],
                ).execute(num_retries=2)
                got += len(page['items'])
                logger.debug(
                    '%s: received %d of %d log events',
                    self.job_uuid, got,
                    got + page['items_available'] - len(page['items']))
                for i in page['items']:
                    for line in i['properties']['text'].split('\n'):
                        self._queue.put(line+'\n')
                    last_id = i['id']
                if (len(page['items']) == 0 or
                    len(page['items']) >= page['items_available']):
                    break
        finally:
            self._queue.put(self.EOF)
Example #16
def main(args, stdout, stderr, api_client=None):
    parser = arg_parser()

    job_order_object = None
    arvargs = parser.parse_args(args)
    if arvargs.create_template and not arvargs.job_order:
        job_order_object = ({}, "")

    try:
        if api_client is None:
            api_client=arvados.api('v1', model=OrderedJsonModel())
        runner = ArvCwlRunner(api_client, work_api=arvargs.work_api)
    except Exception as e:
        logger.error(e)
        return 1

    arvargs.conformance_test = None
    arvargs.use_container = True

    return cwltool.main.main(args=arvargs,
                             stdout=stdout,
                             stderr=stderr,
                             executor=runner.arvExecutor,
                             makeTool=runner.arvMakeTool,
                             versionfunc=versionstring,
                             job_order_object=job_order_object)
Example #17
def NewSummarizer(process_or_uuid, **kwargs):
    """Construct with the appropriate subclass for this uuid/object."""

    if isinstance(process_or_uuid, dict):
        process = process_or_uuid
        uuid = process['uuid']
    else:
        uuid = process_or_uuid
        process = None
        arv = arvados.api('v1', model=OrderedJsonModel())

    if '-dz642-' in uuid:
        if process is None:
            process = arv.containers().get(uuid=uuid).execute()
        klass = ContainerTreeSummarizer
    elif '-xvhdp-' in uuid:
        if process is None:
            process = arv.container_requests().get(uuid=uuid).execute()
        klass = ContainerTreeSummarizer
    elif '-8i9sb-' in uuid:
        if process is None:
            process = arv.jobs().get(uuid=uuid).execute()
        klass = JobTreeSummarizer
    elif '-d1hrv-' in uuid:
        if process is None:
            process = arv.pipeline_instances().get(uuid=uuid).execute()
        klass = PipelineSummarizer
    elif '-4zz18-' in uuid:
        return CollectionSummarizer(collection_id=uuid)
    else:
        raise ArgumentError("Unrecognized uuid %s", uuid)
    return klass(process, uuid=uuid, **kwargs)
Example #18
def container_request(uuid):
  cr_uuid = uuid
  cr = arvados.api().container_requests().list(filters=[["uuid", "=", cr_uuid]]).execute()
  c_uuid = cr['items'][0]['container_uuid']
  req_cs = arvados.api().container_requests().list(filters=[["requesting_container_uuid","=",c_uuid]], limit=1000).execute()
  print "CR_UUID, CR NAME, CR_OUTPUT_UUID, TRASHED_AT"
  for item in req_cs['items']:
    cr_output_uuid = item['output_uuid']
    try:
      trash = arvados.api().collections().list(filters=[["uuid","=", cr_output_uuid]]).execute()['items'][0]['trash_at']
      if not trash:
        print item['uuid'], item['name'], cr_output_uuid, "Null"
      else:
        print item['uuid'], item['name'], cr_output_uuid, trash
    except Exception:
      print "%s does not have an output collection" % item['uuid']
Example #19
def create_project_link(locator, link):
    link['head_uuid'] = locator
    link.setdefault('name', "Collection saved by {}@{} at {}".format(
            pwd.getpwuid(os.getuid()).pw_name,
            socket.gethostname(),
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")))
    return arvados.api('v1').links().create(body=link).execute()
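A hedged sketch of one way this could be called; the link class and both UUIDs are illustrative assumptions rather than values from the original code:

# File a freshly saved collection under a project with a default link name.
link = {
    'link_class': 'name',
    'owner_uuid': 'zzzzz-j7d0g-0123456789abcde',  # placeholder project UUID
}
created = create_project_link('zzzzz-4zz18-0123456789abcde', link)
print(created['uuid'])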
Example #20
def run_keep_proxy():
    if 'ARVADOS_TEST_PROXY_SERVICES' in os.environ:
        return
    stop_keep_proxy()

    port = find_available_port()
    env = os.environ.copy()
    env['ARVADOS_API_TOKEN'] = auth_token('anonymous')
    logf = open(_logfilename('keepproxy'), 'a')
    kp = subprocess.Popen(
        ['keepproxy',
         '-pid='+_pidfile('keepproxy'),
         '-listen=:{}'.format(port)],
        env=env, stdin=open('/dev/null'), stdout=logf, stderr=logf, close_fds=True)

    api = arvados.api(
        version='v1',
        host=os.environ['ARVADOS_API_HOST'],
        token=auth_token('admin'),
        insecure=True)
    for d in api.keep_services().list(
            filters=[['service_type','=','proxy']]).execute()['items']:
        api.keep_services().delete(uuid=d['uuid']).execute()
    api.keep_services().create(body={'keep_service': {
        'service_host': 'localhost',
        'service_port': port,
        'service_type': 'proxy',
        'service_ssl_flag': False,
    }}).execute()
    os.environ["ARVADOS_KEEP_SERVICES"] = "http://localhost:{}".format(port)
    _setport('keepproxy', port)
    _wait_until_port_listens(port)
Example #21
    def runTest(self):
        run_test_server.authorize_with("admin")
        api = arvados.api('v1', cache=False)

        operations = fuse.Operations(os.getuid(), os.getgid())
        e = operations.inodes.add_entry(fuse.TagsDirectory(llfuse.ROOT_INODE, operations.inodes, api))

        llfuse.init(operations, self.mounttmp, [])
        t = threading.Thread(None, lambda: llfuse.main())
        t.start()

        # wait until the driver is finished initializing
        operations.initlock.wait()

        d1 = os.listdir(self.mounttmp)
        d1.sort()
        self.assertEqual(['foo_tag'], d1)

        d2 = os.listdir(os.path.join(self.mounttmp, 'foo_tag'))
        d2.sort()
        self.assertEqual(['1f4b0bc7583c2a7f9102c395f4ffc5e3+45'], d2)

        d3 = os.listdir(os.path.join(self.mounttmp, 'foo_tag', '1f4b0bc7583c2a7f9102c395f4ffc5e3+45'))
        d3.sort()
        self.assertEqual(['foo'], d3)

        files = {}
        files[os.path.join(self.mounttmp, 'foo_tag', '1f4b0bc7583c2a7f9102c395f4ffc5e3+45', 'foo')] = 'foo'

        for k, v in files.items():
            with open(os.path.join(self.mounttmp, k)) as f:
                self.assertEqual(v, f.read())
Example #22
    def test_ArvPutSignedManifest(self):
        # ArvPutSignedManifest runs "arv-put foo" and then attempts to get
        # the newly created manifest from the API server, testing to confirm
        # that the block locators in the returned manifest are signed.
        self.authorize_with('active')

        # Before doing anything, demonstrate that the collection
        # we're about to create is not present in our test fixture.
        api = arvados.api('v1', cache=False)
        manifest_uuid = "00b4e9f40ac4dd432ef89749f1c01e74+47"
        with self.assertRaises(apiclient.errors.HttpError):
            notfound = api.collections().get(uuid=manifest_uuid).execute()

        datadir = tempfile.mkdtemp()
        with open(os.path.join(datadir, "foo"), "w") as f:
            f.write("The quick brown fox jumped over the lazy dog")
        p = subprocess.Popen([sys.executable, arv_put.__file__, datadir],
                             stdout=subprocess.PIPE, env=self.ENVIRON)
        (arvout, arverr) = p.communicate()
        self.assertEqual(p.returncode, 0)
        self.assertEqual(arverr, None)
        self.assertEqual(arvout.strip(), manifest_uuid)

        # The manifest text stored in the API server under the same
        # manifest UUID must use signed locators.
        c = api.collections().get(uuid=manifest_uuid).execute()
        self.assertRegexpMatches(
            c['manifest_text'],
            r'^\. 08a008a01d498c404b0c30852b39d3b8\+44\+A[0-9a-f]+@[0-9a-f]+ 0:44:foo\n')

        os.remove(os.path.join(datadir, "foo"))
        os.rmdir(datadir)
Example #23
    def test_exceptions_without_errors_have_basic_info(self):
        mock_responses = {"arvados.humans.delete": (fake_httplib2_response(500, **self.ERROR_HEADERS), "")}
        req_builder = apiclient_http.RequestMockBuilder(mock_responses)
        api = arvados.api("v1", requestBuilder=req_builder)
        with self.assertRaises(apiclient_errors.HttpError) as err_ctx:
            api.humans().delete(uuid="xyz-xyz-abcdef").execute()
        self.assertIn("500", str(err_ctx.exception))
Example #24
    def authorize_with(self, token_name):
        run_test_server.authorize_with(token_name)
        for v in ["ARVADOS_API_HOST",
                  "ARVADOS_API_HOST_INSECURE",
                  "ARVADOS_API_TOKEN"]:
            self.ENVIRON[v] = arvados.config.settings()[v]
        arv_put.api_client = arvados.api('v1')
Example #25
def get_reusable_tasks(sequence, task_key_params, job_filters):
    reusable_tasks = {}
    jobs = get_jobs_for_task_reuse(job_filters)
    print "Found %s similar previous jobs, checking them for reusable tasks" % (jobs['items_available'])
    task_filters = [
        ['job_uuid', 'in', [job['uuid'] for job in jobs['items']]],
        ['sequence', '=', str(sequence)],
        ['success', '=', 'True'],
    ]
    #print "Querying API server for tasks matching filters %s" % (json.dumps(task_filters))
    tasks = execute_list_all(arvados.api().job_tasks(),
                             distinct=True,
                             select=['uuid', 'job_uuid', 'output',
                                     'parameters', 'success',
                                     'progress', 'started_at',
                                     'finished_at'],
                             filters=task_filters)
    if tasks['items_available'] > 0:
        print "Have %s potential reusable task outputs" % ( tasks['items_available'] )
        for task in tasks['items']:
            have_all_params=True
            for index_param in task_key_params:
                if index_param not in task['parameters']:
                    print "WARNING: missing task key param %s in JobTask %s from Job %s (have parameters: %s)" % (index_param, task['uuid'], task['job_uuid'], ', '.join(task['parameters'].keys()))
                    have_all_params=False
            if have_all_params:
                ct_index = tuple([task['parameters'][index_param] for index_param in task_key_params])
                if ct_index in reusable_tasks:
                    # we have already seen a task with these parameters (from another job?) - verify they have the same output
                    if reusable_tasks[ct_index]['output'] != task['output']:
                        print "WARNING: found two existing candidate JobTasks for parameters %s and the output does not match! (using JobTask %s from Job %s with output %s, but JobTask %s from Job %s had output %s)" % (ct_index, reusable_tasks[ct_index]['uuid'], reusable_tasks[ct_index]['job_uuid'], reusable_tasks[ct_index]['output'], task['uuid'], task['job_uuid'], task['output'])
                else:
                    # store the candidate task in reusable_tasks, indexed on the tuple of params specified in task_key_params
                    reusable_tasks[ct_index] = task
    return reusable_tasks
Example #26
0
def run_keep_proxy():
    stop_keep_proxy()

    admin_token = auth_token('admin')
    port = find_available_port()
    env = os.environ.copy()
    env['ARVADOS_API_TOKEN'] = admin_token
    kp = subprocess.Popen(
        ['keepproxy',
         '-pid={}/keepproxy.pid'.format(TEST_TMPDIR),
         '-listen=:{}'.format(port)],
        env=env)

    api = arvados.api(
        version='v1',
        host=os.environ['ARVADOS_API_HOST'],
        token=admin_token,
        insecure=True)
    for d in api.keep_services().list(
            filters=[['service_type','=','proxy']]).execute()['items']:
        api.keep_services().delete(uuid=d['uuid']).execute()
    api.keep_services().create(body={'keep_service': {
        'service_host': 'localhost',
        'service_port': port,
        'service_type': 'proxy',
        'service_ssl_flag': False,
    }}).execute()
    os.environ["ARVADOS_KEEP_PROXY"] = "http://localhost:{}".format(port)
Example #27
def run_keep(blob_signing_key=None, enforce_permissions=False):
    stop_keep()

    keep_args = {}
    if blob_signing_key:
        with open(os.path.join(TEST_TMPDIR, "keep.blob_signing_key"), "w") as f:
            keep_args['--permission-key-file'] = f.name
            f.write(blob_signing_key)
    if enforce_permissions:
        keep_args['--enforce-permissions'] = 'true'

    api = arvados.api(
        version='v1',
        host=os.environ['ARVADOS_API_HOST'],
        token=os.environ['ARVADOS_API_TOKEN'],
        insecure=True)
    for d in api.keep_services().list().execute()['items']:
        api.keep_services().delete(uuid=d['uuid']).execute()
    for d in api.keep_disks().list().execute()['items']:
        api.keep_disks().delete(uuid=d['uuid']).execute()

    for d in range(0, 2):
        port = _start_keep(d, keep_args)
        svc = api.keep_services().create(body={'keep_service': {
            'uuid': 'zzzzz-bi6l4-keepdisk{:07d}'.format(d),
            'service_host': 'localhost',
            'service_port': port,
            'service_type': 'disk',
            'service_ssl_flag': False,
        }}).execute()
        api.keep_disks().create(body={
            'keep_disk': {'keep_service_uuid': svc['uuid'] }
        }).execute()
Example #28
def create_or_reuse_task(sequence, parameters, reusable_tasks, task_key_params, validate_task_output):
    new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': sequence,
            'parameters': parameters
            }
    # See if there is a task in reusable_tasks that can be reused
    ct_index = tuple([parameters[index_param] for index_param in task_key_params])
    if len(reusable_tasks) == 0:
        print "No reusable tasks were available"
    elif ct_index in reusable_tasks:
        # have a task from which to reuse the output, prepare to create a new, but already finished, task with that output
        reuse_task = reusable_tasks[ct_index]
        if validate_task_output(reuse_task['output']):
            print "Found existing JobTask %s from Job %s. Will use output %s from that JobTask instead of re-running it." % (reuse_task['uuid'], reuse_task['job_uuid'], reuse_task['output'])
            # remove task from reusable_tasks as it won't be used more than once
            del reusable_tasks[ct_index]
            # copy relevant attrs from reuse_task so that the new tasks start already finished
            for attr in ['success', 'output', 'progress', 'started_at', 'finished_at', 'parameters']:
                new_task_attrs[attr] = reuse_task[attr]
            # crunch seems to ignore the fact that the job says it is done and queue it anyway
            # signal ourselves to just immediately exit successfully when we are run
            new_task_attrs['parameters']['reuse_job_task'] = reuse_task['uuid']
        else:
            print "Output %s for potential task reuse did not validate" % (reuse_task['output'])
    else:
        print "No reusable JobTask matched key parameters %s" % (list(ct_index))

    # Create the "new" task (may be new work or may be already finished work)
    new_task = arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if not new_task:
        raise errors.APIError("Attempt to create new job_task failed: [%s]" % new_task_attrs)
    return new_task
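Taken together with get_reusable_tasks above, the intended flow looks roughly like the sketch below; the job filters, key parameter names, and validator are illustrative assumptions:

# Find finished tasks from comparable jobs, then create-or-reuse one task per work unit.
job_filters = [['script', '=', arvados.current_job()['script']],
               ['script_version', '=', arvados.current_job()['script_version']]]
task_key_params = ['inputs', 'ref', 'interval_list', 'name']
reusable = get_reusable_tasks(1, task_key_params, job_filters)

def validate_task_output(output_pdh):
    # Illustrative check: accept any non-empty output locator.
    return bool(output_pdh)

for params in per_unit_task_params:  # one parameters dict per work unit, built elsewhere
    create_or_reuse_task(1, params, reusable, task_key_params, validate_task_output)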
Example #29
    def test_websocket_reconnect_retry(self, event_client_connect):
        event_client_connect.side_effect = [None, Exception('EventClient.connect error'), None]

        logstream = io.BytesIO()
        rootLogger = logging.getLogger()
        streamHandler = logging.StreamHandler(logstream)
        rootLogger.addHandler(streamHandler)

        run_test_server.authorize_with('active')
        events = Queue.Queue(100)

        filters = [['object_uuid', 'is_a', 'arvados#human']]
        self.ws = arvados.events.subscribe(
            arvados.api('v1'), filters,
            events.put_nowait,
            poll_fallback=False,
            last_log_id=None)
        self.assertIsInstance(self.ws, arvados.events.EventClient)

        # simulate improper close
        self.ws.on_closed()

        # verify log messages to ensure retry happened
        log_messages = logstream.getvalue()
        found = log_messages.find("Error 'EventClient.connect error' during websocket reconnect.")
        self.assertNotEqual(found, -1)
        rootLogger.removeHandler(streamHandler)
Example #30
def api_for_instance(instance_name):
    if '/' in instance_name:
        config_file = instance_name
    else:
        config_file = os.path.join(os.environ['HOME'], '.config', 'arvados', "{}.conf".format(instance_name))

    try:
        cfg = arvados.config.load(config_file)
    except (IOError, OSError) as e:
        abort(("Could not open config file {}: {}\n" +
               "You must make sure that your configuration tokens\n" +
               "for Arvados instance {} are in {} and that this\n" +
               "file is readable.").format(
                   config_file, e, instance_name, config_file))

    if 'ARVADOS_API_HOST' in cfg and 'ARVADOS_API_TOKEN' in cfg:
        api_is_insecure = (
            cfg.get('ARVADOS_API_HOST_INSECURE', '').lower() in set(
                ['1', 't', 'true', 'y', 'yes']))
        client = arvados.api('v1',
                             host=cfg['ARVADOS_API_HOST'],
                             token=cfg['ARVADOS_API_TOKEN'],
                             insecure=api_is_insecure,
                             model=OrderedJsonModel())
    else:
        abort('need ARVADOS_API_HOST and ARVADOS_API_TOKEN for {}'.format(instance_name))
    return client
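Typical usage, in the style of arv-copy tooling that keeps per-cluster token files; the instance name is a placeholder:

# Build a client from ~/.config/arvados/src-cluster.conf and make a call.
src_arv = api_for_instance('src-cluster')
print(src_arv.users().current().execute()['uuid'])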
Example #31
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    args = parse_arguments(arguments)
    status = 0
    if api_client is None:
        api_client = arvados.api('v1')

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            print >> stderr, "Cannot use --name with --stream or --raw"
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name, socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        print >> stderr, "Cannot use --project-uuid with --stream or --raw"
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        print >> stderr, error
        sys.exit(1)

    # write_copies diverges from args.replication here.
    # args.replication is how many copies we will instruct Arvados to
    # maintain (by passing it in collections().create()) after all
    # data is written -- and if None was given, we'll use None there.
    # Meanwhile, write_copies is how many copies of each data block we
    # write to Keep, which has to be a number.
    #
    # If we simply changed args.replication from None to a default
    # here, we'd end up erroneously passing the default replication
    # level (instead of None) to collections().create().
    write_copies = (args.replication or api_client._rootDesc.get(
        'defaultCollectionReplication', 2))

    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None
    bytes_expected = expected_bytes_for(args.paths)

    resume_cache = None
    if args.resume:
        try:
            resume_cache = ResumeCache(ResumeCache.make_path(args))
        except (IOError, OSError, ValueError):
            pass  # Couldn't open cache directory/file.  Continue without it.
        except ResumeCacheConflict:
            print >> stderr, "\n".join([
                "arv-put: Another process is already uploading this data.",
                "         Use --no-resume if this is really what you want."
            ])
            sys.exit(1)

    if resume_cache is None:
        writer = ArvPutCollectionWriter(resume_cache,
                                        reporter,
                                        bytes_expected,
                                        num_retries=args.retries,
                                        replication=write_copies)
    else:
        writer = ArvPutCollectionWriter.from_cache(resume_cache,
                                                   reporter,
                                                   bytes_expected,
                                                   num_retries=args.retries,
                                                   replication=write_copies)

    # Install our signal handler for each code in CAUGHT_SIGNALS, and save
    # the originals.
    orig_signal_handlers = {
        sigcode: signal.signal(sigcode, exit_signal_handler)
        for sigcode in CAUGHT_SIGNALS
    }

    if writer.bytes_written > 0:  # We're resuming a previous upload.
        print >> stderr, "\n".join([
            "arv-put: Resuming previous upload from last checkpoint.",
            "         Use the --no-resume option to start over."
        ])

    writer.report_progress()
    writer.do_queued_work()  # Do work resumed from cache.
    for path in args.paths:  # Copy file data to Keep.
        if os.path.isdir(path):
            writer.write_directory_tree(
                path, max_manifest_depth=args.max_manifest_depth)
        else:
            writer.start_new_stream()
            writer.write_file(path, args.filename or os.path.basename(path))
    writer.finish_current_stream()

    if args.progress:  # Print newline to split stderr from stdout for humans.
        print >> stderr

    if args.stream:
        output = writer.manifest_text()
        if args.normalize:
            output = CollectionReader(output).manifest_text(normalize=True)
    elif args.raw:
        output = ','.join(writer.data_locators())
    else:
        try:
            manifest_text = writer.manifest_text()
            if args.normalize:
                manifest_text = CollectionReader(manifest_text).manifest_text(
                    normalize=True)
            replication_attr = 'replication_desired'
            if api_client._schema.schemas['Collection']['properties'].get(
                    replication_attr, None) is None:
                # API called it 'redundancy' before #3410.
                replication_attr = 'redundancy'
            # Register the resulting collection in Arvados.
            collection = api_client.collections().create(
                body={
                    'owner_uuid': project_uuid,
                    'name': collection_name,
                    'manifest_text': manifest_text,
                    replication_attr: args.replication,
                },
                ensure_unique_name=True).execute(num_retries=args.retries)

            print >> stderr, "Collection saved as '%s'" % collection['name']

            if args.portable_data_hash and 'portable_data_hash' in collection and collection[
                    'portable_data_hash']:
                output = collection['portable_data_hash']
            else:
                output = collection['uuid']

        except apiclient_errors.Error as error:
            print >> stderr, (
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1

    # Print the locator (uuid) of the new collection.
    stdout.write(output)
    if not output.endswith('\n'):
        stdout.write('\n')

    for sigcode, orig_handler in orig_signal_handlers.items():
        signal.signal(sigcode, orig_handler)

    if status != 0:
        sys.exit(status)

    if resume_cache is not None:
        resume_cache.destroy()

    return output
Example #32
    def _test_websocket_reconnect(self, close_unexpected):
        run_test_server.authorize_with('active')
        events = queue.Queue(100)

        logstream = tutil.StringIO()
        rootLogger = logging.getLogger()
        streamHandler = logging.StreamHandler(logstream)
        rootLogger.addHandler(streamHandler)

        filters = [['object_uuid', 'is_a', 'arvados#human']]
        filters.append(['created_at', '>=', self.localiso(self.TIME_PAST)])
        self.ws = arvados.events.subscribe(arvados.api('v1'),
                                           filters,
                                           events.put_nowait,
                                           poll_fallback=False,
                                           last_log_id=None)
        self.assertIsInstance(self.ws, arvados.events.EventClient)
        self.assertEqual(200, events.get(True, 5)['status'])

        # create obj
        human = arvados.api('v1').humans().create(body={}).execute()

        # expect an event
        self.assertIn(human['uuid'], events.get(True, 5)['object_uuid'])
        with self.assertRaises(queue.Empty):
            self.assertEqual(events.get(True, 2), None)

        # close (im)properly
        if close_unexpected:
            self.ws.ec.close_connection()
        else:
            self.ws.close()

        # create one more obj
        human2 = arvados.api('v1').humans().create(body={}).execute()

        # (un)expect the object creation event
        if close_unexpected:
            log_object_uuids = []
            for i in range(0, 2):
                event = events.get(True, 5)
                if event.get('object_uuid') != None:
                    log_object_uuids.append(event['object_uuid'])
            with self.assertRaises(queue.Empty):
                self.assertEqual(events.get(True, 2), None)
            self.assertNotIn(human['uuid'], log_object_uuids)
            self.assertIn(human2['uuid'], log_object_uuids)
        else:
            with self.assertRaises(queue.Empty):
                self.assertEqual(events.get(True, 2), None)

        # verify log messages to confirm whether an (un)expected close occurred
        log_messages = logstream.getvalue()
        closeLogFound = log_messages.find("Unexpected close. Reconnecting.")
        retryLogFound = log_messages.find(
            "Error during websocket reconnect. Will retry")
        if close_unexpected:
            self.assertNotEqual(closeLogFound, -1)
        else:
            self.assertEqual(closeLogFound, -1)
        rootLogger.removeHandler(streamHandler)
Example #33
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    logger = logging.getLogger('arvados.arv_put')
    logger.setLevel(logging.INFO)
    args = parse_arguments(arguments)
    status = 0
    if api_client is None:
        api_client = arvados.api('v1')

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            logger.error("Cannot use --name with --stream or --raw")
            sys.exit(1)
        elif args.update_collection:
            logger.error("Cannot use --name with --update-collection")
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name, socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        logger.error("Cannot use --project-uuid with --stream or --raw")
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        logger.error(error)
        sys.exit(1)

    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None

    # If this is used by a human, and there's at least one directory to be
    # uploaded, the expected bytes calculation can take a moment.
    if args.progress and any([os.path.isdir(f) for f in args.paths]):
        logger.info("Calculating upload size, this could take some time...")
    bytes_expected = expected_bytes_for(args.paths)

    try:
        writer = ArvPutUploadJob(paths=args.paths,
                                 resume=args.resume,
                                 use_cache=args.use_cache,
                                 filename=args.filename,
                                 reporter=reporter,
                                 bytes_expected=bytes_expected,
                                 num_retries=args.retries,
                                 replication_desired=args.replication,
                                 put_threads=args.threads,
                                 name=collection_name,
                                 owner_uuid=project_uuid,
                                 ensure_unique_name=True,
                                 update_collection=args.update_collection,
                                 logger=logger,
                                 dry_run=args.dry_run)
    except ResumeCacheConflict:
        logger.error("\n".join([
            "arv-put: Another process is already uploading this data.",
            "         Use --no-cache if this is really what you want."
        ]))
        sys.exit(1)
    except CollectionUpdateError as error:
        logger.error("\n".join(["arv-put: %s" % str(error)]))
        sys.exit(1)
    except ArvPutUploadIsPending:
        # Dry run check successful, return proper exit code.
        sys.exit(2)
    except ArvPutUploadNotPending:
        # No files pending for upload
        sys.exit(0)

    # Install our signal handler for each code in CAUGHT_SIGNALS, and save
    # the originals.
    orig_signal_handlers = {
        sigcode: signal.signal(sigcode, exit_signal_handler)
        for sigcode in CAUGHT_SIGNALS
    }

    if not args.dry_run and not args.update_collection and args.resume and writer.bytes_written > 0:
        logger.warning("\n".join([
            "arv-put: Resuming previous upload from last checkpoint.",
            "         Use the --no-resume option to start over."
        ]))

    if not args.dry_run:
        writer.report_progress()
    output = None
    try:
        writer.start(save_collection=not (args.stream or args.raw))
    except arvados.errors.ApiError as error:
        logger.error("\n".join(["arv-put: %s" % str(error)]))
        sys.exit(1)
    except ArvPutUploadIsPending:
        # Dry run check successful, return proper exit code.
        sys.exit(2)
    except ArvPutUploadNotPending:
        # No files pending for upload
        sys.exit(0)

    if args.progress:  # Print newline to split stderr from stdout for humans.
        logger.info("\n")

    if args.stream:
        if args.normalize:
            output = writer.manifest_text(normalize=True)
        else:
            output = writer.manifest_text()
    elif args.raw:
        output = ','.join(writer.data_locators())
    else:
        try:
            if args.update_collection:
                logger.info("Collection updated: '{}'".format(
                    writer.collection_name()))
            else:
                logger.info("Collection saved as '{}'".format(
                    writer.collection_name()))
            if args.portable_data_hash:
                output = writer.portable_data_hash()
            else:
                output = writer.manifest_locator()
        except apiclient_errors.Error as error:
            logger.error(
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1

    # Print the locator (uuid) of the new collection.
    if output is None:
        status = status or 1
    else:
        stdout.write(output)
        if not output.endswith('\n'):
            stdout.write('\n')

    for sigcode, orig_handler in orig_signal_handlers.items():
        signal.signal(sigcode, orig_handler)

    if status != 0:
        sys.exit(status)

    # Success!
    return output
Example #34
def main(arguments=None, stdout=sys.stdout):
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
        stdout.write(
            fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION",
                       "CREATED"))
        try:
            for i, j in list_images_in_arv(api, args.retries):
                stdout.write(
                    fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i,
                               j["timestamp"].strftime("%c")))
        except IOError as e:
            if e.errno == errno.EPIPE:
                pass
            else:
                raise
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        logger.error(error.message)
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            logger.warning("forcing incompatible image")
        else:
            logger.error("refusing to store " \
                "incompatible format (use --force-image-format to override)")
            sys.exit(1)

    image_repo_tag = '{}:{}'.format(
        args.image,
        args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(
                image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = _get_docker_links(
            api,
            args.retries,
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]])
        if existing_links:
            # get readable collections
            collections = api.collections().list(
                filters=[[
                    'uuid', 'in',
                    [link['head_uuid'] for link in existing_links]
                ]],
                select=["uuid", "owner_uuid", "name", "manifest_text"
                        ]).execute(num_retries=args.retries)['items']

            if collections:
                # check for repo+tag links on these collections
                if image_repo_tag:
                    existing_repo_tag = _get_docker_links(
                        api,
                        args.retries,
                        filters=[['link_class', '=', 'docker_image_repo+tag'],
                                 ['name', '=', image_repo_tag],
                                 [
                                     'head_uuid', 'in',
                                     [c["uuid"] for c in collections]
                                 ]])
                else:
                    existing_repo_tag = []

                try:
                    coll_uuid = next(
                        items_owned_by(parent_project_uuid,
                                       collections))['uuid']
                except StopIteration:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={
                            "manifest_text": collections[0]['manifest_text'],
                            "name": collection_name,
                            "owner_uuid": parent_project_uuid
                        },
                        ensure_unique_name=True).execute(
                            num_retries=args.retries)['uuid']

                link_base = {
                    'owner_uuid': parent_project_uuid,
                    'head_uuid': coll_uuid,
                    'properties': existing_links[0]['properties']
                }

                if not any(items_owned_by(parent_project_uuid,
                                          existing_links)):
                    # create image link owned by the project
                    make_link(api, args.retries, 'docker_image_hash',
                              image_hash, **link_base)

                if image_repo_tag and not any(
                        items_owned_by(parent_project_uuid,
                                       existing_repo_tag)):
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                stdout.write(coll_uuid + "\n")

                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(put_args +
                             ['--filename', outfile_name, image_file.name],
                             stdout=stdout).strip()

    # Read the image metadata and make Arvados links from it.
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
    if image_hash_type:
        json_filename = raw_image_hash + '.json'
    else:
        json_filename = raw_image_hash + '/json'
    json_file = image_tar.extractfile(image_tar.getmember(json_filename))
    image_metadata = json.load(json_file)
    json_file.close()
    image_tar.close()
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries, 'docker_image_repo+tag', image_repo_tag,
                  **link_base)

    # Clean up.
    image_file.close()
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            if error.errno != errno.ENOENT:
                raise
Example #35
0
    def test_image_format_compatibility(self, _1, _2):
        old_id = hashlib.sha256(b'old').hexdigest()
        new_id = 'sha256:' + hashlib.sha256(b'new').hexdigest()
        for supported, img_id, expect_ok in [(['v1'], old_id, True),
                                             (['v1'], new_id, False),
                                             (None, old_id, False),
                                             ([], old_id, False),
                                             ([], new_id, False),
                                             (['v1', 'v2'], new_id, True),
                                             (['v1'], new_id, False),
                                             (['v2'], new_id, True)]:

            fakeDD = arvados.api('v1')._rootDesc
            if supported is None:
                del fakeDD['dockerImageFormats']
            else:
                fakeDD['dockerImageFormats'] = supported

            err = tutil.StringIO()
            out = tutil.StringIO()

            with tutil.redirected_streams(stdout=out), \
                 mock.patch('arvados.api') as api, \
                 mock.patch('arvados.commands.keepdocker.popen_docker',
                            return_value=subprocess.Popen(
                                ['echo', img_id],
                                stdout=subprocess.PIPE)), \
                 mock.patch('arvados.commands.keepdocker.prep_image_file',
                            side_effect=StopTest), \
                 self.assertRaises(StopTest if expect_ok else SystemExit):

                api()._rootDesc = fakeDD
                self.run_arv_keepdocker(['--force', 'testimage'], err)

            self.assertEqual(out.getvalue(), '')
            if expect_ok:
                self.assertNotRegex(err.getvalue(),
                                    "refusing to store",
                                    msg=repr((supported, img_id)))
            else:
                self.assertRegex(err.getvalue(),
                                 "refusing to store",
                                 msg=repr((supported, img_id)))
            if not supported:
                self.assertRegex(
                    err.getvalue(),
                    "server does not specify supported image formats",
                    msg=repr((supported, img_id)))

        fakeDD = arvados.api('v1')._rootDesc
        fakeDD['dockerImageFormats'] = ['v1']
        err = tutil.StringIO()
        out = tutil.StringIO()
        with tutil.redirected_streams(stdout=out), \
             mock.patch('arvados.api') as api, \
             mock.patch('arvados.commands.keepdocker.popen_docker',
                        return_value=subprocess.Popen(
                            ['echo', new_id],
                            stdout=subprocess.PIPE)), \
             mock.patch('arvados.commands.keepdocker.prep_image_file',
                        side_effect=StopTest), \
             self.assertRaises(StopTest):
            api()._rootDesc = fakeDD
            self.run_arv_keepdocker(
                ['--force', '--force-image-format', 'testimage'], err)
        self.assertRegex(err.getvalue(), "forcing incompatible image")
Example #36
0
import pkg_resources
import yaml
import re
import string
import arvados

api = arvados.api()


def type_to_heading(type_name):
    """
    Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading.
    """

    # Remove camel case
    decamel = re.sub('([A-Z])', r' \1', type_name)
    # Split
    parts = decamel.split()
    # Capitalize words and remove unwanted components
    filtered = [
        part.capitalize() for part in parts
        if (part.lower() != 'schema' and part != '')
    ]
    # Reassemble
    return ' '.join(filtered)
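# For reference, a couple of illustrative calls (outputs follow directly from the
# implementation above; the type names themselves are just examples):
#   type_to_heading("sampleSchema")            -> "Sample"
#   type_to_heading("hostHealthStatusSchema")  -> "Host Health Status"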


def name_to_label(field_name):
    """
    Turn a field name like "host_health_status" from the metadata schema into a human-readable label.
    """
Example #37
0
 def test_nonempty_list(self):
     answer = arvados.api('v1').collections().list().execute()
     self.assertNotEqual(0, answer['items_available'])
     self.assertNotEqual(0, len(answer['items']))
Example #38
0
 def test_new_api_objects_with_cache(self):
     clients = [arvados.api('v1', cache=True) for index in [0, 1]]
     self.assertIsNot(*clients)
Example #39
0
def run():
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(
        logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    runner = None
    try:
        job_order_object = arvados.current_job()['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'],
                                     job_order_object.pop("cwl:tool"))

        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            if pdh_path.match(v):
                return "keep:%s" % v
            else:
                return v

        def keeppathObj(v):
            if "location" in v:
                v["location"] = keeppath(v["location"])

        for k, v in job_order_object.items():
            if isinstance(
                    v,
                    basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        output_name = None
        output_tags = None
        enable_reuse = True
        on_error = "continue"
        if "arv:output_name" in job_order_object:
            output_name = job_order_object["arv:output_name"]
            del job_order_object["arv:output_name"]

        if "arv:output_tags" in job_order_object:
            output_tags = job_order_object["arv:output_tags"]
            del job_order_object["arv:output_tags"]

        if "arv:enable_reuse" in job_order_object:
            enable_reuse = job_order_object["arv:enable_reuse"]
            del job_order_object["arv:enable_reuse"]

        if "arv:on_error" in job_order_object:
            on_error = job_order_object["arv:on_error"]
            del job_order_object["arv:on_error"]

        runner = arvados_cwl.ArvCwlRunner(
            api_client=arvados.api('v1', model=OrderedJsonModel()),
            output_name=output_name,
            output_tags=output_tags)

        make_fs_access = functools.partial(
            CollectionFsAccess, collection_cache=runner.collection_cache)

        t = load_tool(toolpath,
                      runner.arv_make_tool,
                      fetcher_constructor=functools.partial(
                          CollectionFetcher,
                          api_client=runner.api,
                          fs_access=make_fs_access(""),
                          num_retries=runner.num_retries))

        args = argparse.Namespace()
        args.project_uuid = arvados.current_job()["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = False
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {
            "uuid": arvados.current_job()["uuid"],
            "state": arvados.current_job()["state"]
        }
        args.make_fs_access = make_fs_access
        args.trash_intermediate = False
        args.intermediate_output_ttl = 0

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        if isinstance(e, WorkflowException):
            logging.info("Workflow error %s", e)
        else:
            logging.exception("Unhandled exception")
        if runner and runner.final_output_collection:
            outputCollection = runner.final_output_collection.portable_data_hash()
        else:
            outputCollection = None
        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': outputCollection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()
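A quick way to see what the keeppath() helper in this example does: any value that looks like a portable data hash (32 hex digits plus a size, optionally followed by a path) is rewritten as a keep: URI, and everything else is returned unchanged. The hash below is a made-up placeholder.

import re

pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

def keeppath(v):
    return "keep:%s" % v if pdh_path.match(v) else v

print(keeppath("99999999999999999999999999999999+2048/reads.fastq"))
# -> keep:99999999999999999999999999999999+2048/reads.fastq
print(keeppath("/tmp/reads.fastq"))
# -> /tmp/reads.fastq (unchanged)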
Example #40
0
 def setUpClass(cls):
     super(CollectionBenchmark, cls).setUpClass()
     run_test_server.authorize_with('active')
     cls.api_client = arvados.api('v1')
     cls.keep_client = arvados.KeepClient(api_client=cls.api_client,
                                          local_store=cls.local_store)
Example #41
0
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import arvados
import json
import sys

j = json.load(open(sys.argv[1]))

apiA = arvados.api(host=j["arvados_api_hosts"][0],
                   token=j["superuser_tokens"][0],
                   insecure=True)
apiB = arvados.api(host=j["arvados_api_hosts"][1],
                   token=j["superuser_tokens"][1],
                   insecure=True)
apiC = arvados.api(host=j["arvados_api_hosts"][2],
                   token=j["superuser_tokens"][2],
                   insecure=True)


def maketoken(newtok):
    return 'v2/' + newtok["uuid"] + '/' + newtok["api_token"]


def get_user_data(case_nr, is_active=True):
    return {
        "email": "case{}@test".format(case_nr),
        "first_name": "Case{}".format(case_nr),
        "last_name": "Testuser",
        "is_active": is_active
Example #42
0
def main(args, stdout, stderr, api_client=None, keep_client=None):
    parser = arg_parser()

    job_order_object = None
    arvargs = parser.parse_args(args)

    if arvargs.version:
        print versionstring()
        return

    if arvargs.update_workflow:
        if arvargs.update_workflow.find('-7fd4e-') == 5:
            want_api = 'containers'
        elif arvargs.update_workflow.find('-p5p6p-') == 5:
            want_api = 'jobs'
        else:
            want_api = None
        if want_api and arvargs.work_api and want_api != arvargs.work_api:
            logger.error(
                '--update-workflow arg {!r} uses {!r} API, but --api={!r} specified'
                .format(arvargs.update_workflow, want_api, arvargs.work_api))
            return 1
        arvargs.work_api = want_api

    if (arvargs.create_workflow
            or arvargs.update_workflow) and not arvargs.job_order:
        job_order_object = ({}, "")

    add_arv_hints()

    try:
        if api_client is None:
            api_client = arvados.api('v1', model=OrderedJsonModel())
        if keep_client is None:
            keep_client = arvados.keep.KeepClient(api_client=api_client,
                                                  num_retries=4)
        runner = ArvCwlRunner(api_client,
                              work_api=arvargs.work_api,
                              keep_client=keep_client,
                              num_retries=4,
                              output_name=arvargs.output_name,
                              output_tags=arvargs.output_tags)
    except Exception as e:
        logger.error(e)
        return 1

    if arvargs.debug:
        logger.setLevel(logging.DEBUG)
        logging.getLogger('arvados').setLevel(logging.DEBUG)

    if arvargs.quiet:
        logger.setLevel(logging.WARN)
        logging.getLogger('arvados').setLevel(logging.WARN)
        logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

    if arvargs.metrics:
        metrics.setLevel(logging.DEBUG)
        logging.getLogger("cwltool.metrics").setLevel(logging.DEBUG)

    if arvargs.log_timestamps:
        arvados.log_handler.setFormatter(
            logging.Formatter(
                '%(asctime)s %(name)s %(levelname)s: %(message)s',
                '%Y-%m-%d %H:%M:%S'))
    else:
        arvados.log_handler.setFormatter(
            logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    arvargs.conformance_test = None
    arvargs.use_container = True
    arvargs.relax_path_checks = True
    arvargs.print_supported_versions = False

    make_fs_access = partial(CollectionFsAccess,
                             collection_cache=runner.collection_cache)

    return cwltool.main.main(args=arvargs,
                             stdout=stdout,
                             stderr=stderr,
                             executor=runner.arv_executor,
                             makeTool=runner.arv_make_tool,
                             versionfunc=versionstring,
                             job_order_object=job_order_object,
                             make_fs_access=make_fs_access,
                             fetcher_constructor=partial(
                                 CollectionFetcher,
                                 api_client=api_client,
                                 fs_access=make_fs_access(""),
                                 num_retries=runner.num_retries),
                             resolver=partial(collectionResolver,
                                              api_client,
                                              num_retries=runner.num_retries),
                             logger_handler=arvados.log_handler,
                             custom_schema_callback=add_arv_hints)
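The --update-workflow check in this example relies on the Arvados UUID layout (five-character cluster id, five-character object-type code, fifteen-character suffix), so the type infix always starts at index 5. With a made-up workflow UUID:

uuid = "zzzzz-7fd4e-0123456789abcde"   # placeholder workflow UUID
print(uuid.find('-7fd4e-'))            # -> 5, so want_api becomes 'containers'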
Example #43
0
    def arv_executor(self, tool, job_order, **kwargs):
        self.debug = kwargs.get("debug")

        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(
            CollectionFsAccess, collection_cache=self.collection_cache)
        self.fs_access = make_fs_access(kwargs["basedir"])

        self.trash_intermediate = kwargs["trash_intermediate"]
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = kwargs["intermediate_output_ttl"]
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get(
                "label") or tool.metadata.get("label") or os.path.basename(
                    tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        override_tools = {}
        upload_workflow_deps(self, tool, override_tools)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata,
                                  override_tools=override_tools)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"], tool,
                                     job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"])
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs["name"]), "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        if self.work_api == "containers":
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and kwargs.get(
                        "wait"):
                    kwargs["runnerjob"] = tool.tool["id"]
                    runnerjob = tool.job(job_order, self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(
                        self,
                        tool,
                        job_order,
                        kwargs.get("enable_reuse"),
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                        name=kwargs.get("name"),
                        on_error=kwargs.get("on_error"),
                        submit_runner_image=kwargs.get("submit_runner_image"),
                        intermediate_output_ttl=kwargs.get(
                            "intermediate_output_ttl"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(
                    self,
                    tool,
                    job_order,
                    kwargs.get("enable_reuse"),
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=kwargs.get("submit_runner_ram"),
                    name=kwargs.get("name"),
                    on_error=kwargs.get("on_error"),
                    submit_runner_image=kwargs.get("submit_runner_image"))
        elif "cwl_runner_job" not in kwargs and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": kwargs["name"] if kwargs.get("name") else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob, ))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order, self.output_callback, **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs."
                        )
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error(
                    "Execution failed: %s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                self.api.container_requests().update(
                    uuid=runnerjob.uuid, body={
                        "priority": "0"
                    }).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
Example #44
0
def _get_api_client():
    import arvados
    return arvados.api("v1")
Example #45
0
    def __init__(self,
                 api_client=None,
                 proxy=None,
                 timeout=300,
                 api_token=None,
                 local_store=None,
                 block_cache=None,
                 num_retries=0):
        """Initialize a new KeepClient.

        Arguments:
        * api_client: The API client to use to find Keep services.  If not
          provided, KeepClient will build one from available Arvados
          configuration.
        * proxy: If specified, this KeepClient will send requests to this
          Keep proxy.  Otherwise, KeepClient will fall back to the setting
          of the ARVADOS_KEEP_PROXY configuration setting.  If you want to
          ensure KeepClient does not use a proxy, pass in an empty string.
        * timeout: The timeout for all HTTP requests, in seconds.  Default
          300.
        * api_token: If you're not using an API client, but only talking
          directly to a Keep proxy, this parameter specifies an API token
          to authenticate Keep requests.  It is an error to specify both
          api_client and api_token.  If you specify neither, KeepClient
          will use one available from the Arvados configuration.
        * local_store: If specified, this KeepClient will bypass Keep
          services, and save data to the named directory.  If unspecified,
          KeepClient will fall back to the setting of the $KEEP_LOCAL_STORE
          environment variable.  If you want to ensure KeepClient does not
          use local storage, pass in an empty string.  This is primarily
          intended to mock a server for testing.
        * num_retries: The default number of times to retry failed requests.
          This will be used as the default num_retries value when get() and
          put() are called.  Default 0.
        """
        self.lock = threading.Lock()
        if proxy is None:
            proxy = config.get('ARVADOS_KEEP_PROXY')
        if api_token is None:
            api_token = config.get('ARVADOS_API_TOKEN')
        elif api_client is not None:
            raise ValueError(
                "can't build KeepClient with both API client and token")
        if local_store is None:
            local_store = os.environ.get('KEEP_LOCAL_STORE')

        self.block_cache = block_cache if block_cache else KeepBlockCache()

        if local_store:
            self.local_store = local_store
            self.get = self.local_store_get
            self.put = self.local_store_put
        else:
            self.timeout = timeout
            self.num_retries = num_retries
            if proxy:
                if not proxy.endswith('/'):
                    proxy += '/'
                self.api_token = api_token
                self.service_roots = [proxy]
                self.using_proxy = True
                self.static_service_roots = True
            else:
                # It's important to avoid instantiating an API client
                # unless we actually need one, for testing's sake.
                if api_client is None:
                    api_client = arvados.api('v1')
                self.api_client = api_client
                self.api_token = api_client.api_token
                self.service_roots = None
                self.using_proxy = None
                self.static_service_roots = False
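Given the constructor arguments documented above, construction typically looks like one of the following (a sketch only; the proxy URL and token are placeholders):

import arvados

# Let KeepClient discover Keep services through an API client.
keep = arvados.KeepClient(api_client=arvados.api('v1'), num_retries=3)

# Or talk only to a Keep proxy, authenticating with an explicit token
# (it is an error to pass both api_client and api_token):
# keep = arvados.KeepClient(proxy="https://keep.example.org/", api_token="placeholder-token")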
Example #46
0
def main(fastq_project, workflows_project, metagenome_workflow_uuid, pangenome_workflow_uuid, pangenome_result_col_uuid): 
    logging.info("Starting a analysis run")

    api = arvados.api('v1', host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN)
    col = arvados.collection.Collection(api_client=api)
    state = {}
    if os.path.exists('state.json'):
        state = json.loads(open('state.json').read())
    reads = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", fastq_project]])
    pangenome_data = []
    report_data = {'kraken': [], 'mlst': [], 'resistome': [], 'virulome': [], 'prokka': []}
    update_pangenome = False
    for it in reads[1:]:
        col = api.collections().get(uuid=it['uuid']).execute()
        if 'sequence_label' not in it['properties']:
            continue
        sample_id = it['properties']['sequence_label']
        if 'analysis_status' in it['properties']:
            pangenome_data.append((sample_id, col['portable_data_hash']))
            col_reader = CollectionReader(col['uuid'])
            report_data['kraken'].append((sample_id, get_kraken_report(col_reader)))
            report_data['mlst'].append((sample_id, get_mlst_report(col_reader)))
            report_data['resistome'].append((sample_id, get_resistome_report(col_reader)))
            report_data['virulome'].append((sample_id, get_virulome_report(col_reader)))
            report_data['prokka'].append((sample_id, get_prokka_report(col_reader)))
        continue
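        # NOTE: the bare `continue` above ends every loop iteration here, so the
        # submission/state-machine code below it is never reached as this example
        # is written.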
        if sample_id not in state:
            state[sample_id] = {
                'status': 'new',
                'container_request': None,
                'output_collection': None,
            }
        sample_state = state[sample_id]
        if sample_state['status'] == 'new':
            container_request, status = submit_new_request(
                api, workflows_project, metagenome_workflow_uuid, sample_id,
                it['portable_data_hash'])
            sample_state['status'] = status
            sample_state['container_request'] = container_request
            logging.info('Submitted analysis request for %s', sample_id)
        elif sample_state['status'] == 'submitted':
            # TODO: check container request status
            if sample_state['container_request'] is None:
                raise Exception("Container request cannot be empty when status is submitted")
            cr = api.container_requests().get(
                uuid=sample_state["container_request"]).execute()
            cr_state = get_cr_state(api, cr)
            logging.info('Container request for %s is %s', sample_id, cr_state)
            if cr_state == 'Complete':
                out_col = api.collections().get(uuid=cr["output_uuid"]).execute()
                sample_state['output_collection'] = cr["output_uuid"]
                sample_state['status'] = 'complete'
                # Copy output files to reads collection
                it['properties']['analysis_status'] = 'complete'
                api.collections().update(
                    uuid=it['uuid'],
                    body={"manifest_text": col["manifest_text"] + out_col["manifest_text"],
                          "properties": it["properties"]}).execute()
                pangenome_data.append((sample_id, col['portable_data_hash']))
                update_pangenome = True
            elif cr_state == 'Failed':
                state[sample_id] = {
                    'status': 'new',
                    'container_request': None,
                    'output_collection': None,
                }
        elif sample_state['status'] == 'complete':
            # TODO: do nothing
            pass
    if update_pangenome:
        container_request, status = submit_pangenome(api, workflows_project, pangenome_workflow_uuid, pangenome_data)
        if status == 'submitted':
            state['last_pangenome_request'] = container_request
            state['last_pangenome_request_status'] = 'submitted'
            logging.info('Submitted pangenome request %s', container_request)
    else:
        cr = api.container_requests().get(
            uuid=state["last_pangenome_request"]).execute()
        cr_state = get_cr_state(api, cr)
        logging.info('Container request for pangenome workflow is %s', cr_state)
        if state['last_pangenome_request_status'] == 'submitted' and cr_state == 'Complete':
            logging.info('Updating results collection')
            out_col = api.collections().get(uuid=cr["output_uuid"]).execute()
            api.collections().update(
                uuid=pangenome_result_col_uuid,
                body={"manifest_text": out_col["manifest_text"]}).execute()
            state['last_pangenome_request_status'] = 'complete'

    col_reader = CollectionReader(pangenome_result_col_uuid)
    report_data["iqtree"] = get_iqtree_result(col_reader)
    report_data["roary_svg"] = get_roary_svg(col_reader)
    report_data["roary_stats"] = get_roary_stats(col_reader)
    generate_report(report_data)
    
    with open('state.json', 'w') as f:
        f.write(json.dumps(state))
Example #47
0
 def test_empty_list(self):
     answer = arvados.api('v1').humans().list(
         filters=[['uuid', '=', None]]).execute()
     self.assertEqual(answer['items_available'], len(answer['items']))
Example #48
0
    def arv_executor(self, tool, job_order, runtimeContext, logger=None):
        self.debug = runtimeContext.debug

        tool.visit(self.check_features)

        self.project_uuid = runtimeContext.project_uuid
        self.pipeline = None
        self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
        self.secret_store = runtimeContext.secret_store

        self.trash_intermediate = runtimeContext.trash_intermediate
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception(
                "--trash-intermediate is only supported with --api=containers."
            )

        self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception(
                "--intermediate-output-ttl is only supported with --api=containers."
            )
        if self.intermediate_output_ttl < 0:
            raise Exception(
                "Invalid value %d for --intermediate-output-ttl, cannot be less than zero"
                % self.intermediate_output_ttl)

        if runtimeContext.submit_request_uuid and self.work_api != "containers":
            raise Exception(
                "--submit-request-uuid requires containers API, but using '{}' api"
                .format(self.work_api))

        if not runtimeContext.name:
            runtimeContext.name = self.name = tool.tool.get(
                "label") or tool.metadata.get("label") or os.path.basename(
                    tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        # Don't validate this time because it will just print redundant errors.
        loadingContext = self.loadingContext.copy()
        loadingContext.loader = tool.doc_loader
        loadingContext.avsc_names = tool.doc_schema
        loadingContext.metadata = tool.metadata
        loadingContext.do_validate = False

        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  loadingContext)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                     tool, job_order)

        existing_uuid = runtimeContext.update_workflow
        if existing_uuid or runtimeContext.create_workflow:
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(
                    self,
                    tool,
                    job_order,
                    runtimeContext.enable_reuse,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map,
                    loadingContext=loadingContext)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(
                    self,
                    tool,
                    job_order,
                    self.project_uuid,
                    uuid=existing_uuid,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    merged_map=merged_map), "success")

        self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
        self.eval_timeout = runtimeContext.eval_timeout

        runtimeContext = runtimeContext.copy()
        runtimeContext.use_container = True
        runtimeContext.tmpdir_prefix = "tmp"
        runtimeContext.work_api = self.work_api

        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception(
                    "--ignore-docker-for-reuse not supported with containers API."
                )
            runtimeContext.outdir = "/var/spool/cwl"
            runtimeContext.docker_outdir = "/var/spool/cwl"
            runtimeContext.tmpdir = "/tmp"
            runtimeContext.docker_tmpdir = "/tmp"
        elif self.work_api == "jobs":
            if runtimeContext.priority != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            runtimeContext.outdir = "$(task.outdir)"
            runtimeContext.docker_outdir = "$(task.outdir)"
            runtimeContext.tmpdir = "$(task.tmpdir)"

        if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        if self.should_estimate_cache_size:
            visited = set()
            estimated_size = [0]

            def estimate_collection_cache(obj):
                if obj.get("location", "").startswith("keep:"):
                    m = pdh_size.match(obj["location"][5:])
                    if m and m.group(1) not in visited:
                        visited.add(m.group(1))
                        estimated_size[0] += int(m.group(2))

            visit_class(job_order, ("File", "Directory"),
                        estimate_collection_cache)
            runtimeContext.collection_cache_size = max(
                ((estimated_size[0] * 192) / (1024 * 1024)) + 1, 256)
            self.collection_cache.set_cap(
                runtimeContext.collection_cache_size * 1024 * 1024)

        logger.info("Using collection cache size %s MiB",
                    runtimeContext.collection_cache_size)

        runnerjob = None
        if runtimeContext.submit:
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if (tool.tool["class"] == "CommandLineTool"
                        and runtimeContext.wait
                        and not runtimeContext.always_submit_runner):
                    runtimeContext.runnerjob = tool.tool["id"]
                else:
                    tool = RunnerContainer(
                        self,
                        tool,
                        loadingContext,
                        runtimeContext.enable_reuse,
                        self.output_name,
                        self.output_tags,
                        submit_runner_ram=runtimeContext.submit_runner_ram,
                        name=runtimeContext.name,
                        on_error=runtimeContext.on_error,
                        submit_runner_image=runtimeContext.submit_runner_image,
                        intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                        merged_map=merged_map,
                        priority=runtimeContext.priority,
                        secret_store=self.secret_store,
                        collection_cache_size=runtimeContext.collection_cache_size,
                        collection_cache_is_default=self.should_estimate_cache_size)
            elif self.work_api == "jobs":
                tool = RunnerJob(
                    self,
                    tool,
                    loadingContext,
                    runtimeContext.enable_reuse,
                    self.output_name,
                    self.output_tags,
                    submit_runner_ram=runtimeContext.submit_runner_ram,
                    name=runtimeContext.name,
                    on_error=runtimeContext.on_error,
                    submit_runner_image=runtimeContext.submit_runner_image,
                    merged_map=merged_map)
        elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": (runtimeContext.name if runtimeContext.name
                             else shortname(tool.tool["id"])),
                    "components": {},
                    "state": "RunningOnClient"
                }).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runtimeContext.cwl_runner_job is not None:
            self.uuid = runtimeContext.cwl_runner_job.get('uuid')

        jobiter = tool.job(job_order, self.output_callback, runtimeContext)

        if runtimeContext.submit and not runtimeContext.wait:
            runnerjob = jobiter.next()
            runnerjob.run(runtimeContext)
            return (runnerjob.uuid, "success")

        current_container = arvados_cwl.util.get_current_container(
            self.api, self.num_retries, logger)
        if current_container:
            logger.info("Running inside container %s",
                        current_container.get("uuid"))

        self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

        try:
            self.workflow_eval_lock.acquire()

            # Holds the lock while this code runs and releases it when
            # it is safe to do so in self.workflow_eval_lock.wait(),
            # at which point on_message can update job state and
            # process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    break

                if self.task_queue.error is not None:
                    raise self.task_queue.error

                if runnable:
                    with Perf(metrics, "run"):
                        self.start_run(runnable, runtimeContext)
                else:
                    if (self.task_queue.in_flight + len(self.processes)) > 0:
                        self.workflow_eval_lock.wait(3)
                    else:
                        logger.error(
                            "Workflow is deadlocked, no runnable processes and not waiting on any pending processes."
                        )
                        break

                if self.stop_polling.is_set():
                    break

                loopperf.__enter__()
            loopperf.__exit__()

            while (self.task_queue.in_flight + len(self.processes)) > 0:
                if self.task_queue.error is not None:
                    raise self.task_queue.error
                self.workflow_eval_lock.wait(3)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
                logger.error("Interrupted, workflow will be cancelled")
            else:
                logger.error(
                    "Execution failed:\n%s",
                    sys.exc_info()[1],
                    exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(
                    uuid=self.pipeline["uuid"], body={
                        "state": "Failed"
                    }).execute(num_retries=self.num_retries)
            if runtimeContext.submit and isinstance(tool, Runner):
                runnerjob = tool
                if runnerjob.uuid and self.work_api == "containers":
                    self.api.container_requests().update(
                        uuid=runnerjob.uuid, body={
                            "priority": "0"
                        }).execute(num_retries=self.num_retries)
        finally:
            self.workflow_eval_lock.release()
            self.task_queue.drain()
            self.stop_polling.set()
            self.polling_thread.join()
            self.task_queue.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if runtimeContext.submit and isinstance(tool, Runner):
            logger.info("Final output collection %s", tool.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(
                    tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""

            storage_classes = runtimeContext.storage_classes.strip().split(",")
            self.final_output, self.final_output_collection = self.make_output_collection(
                self.output_name, storage_classes, self.output_tags,
                self.final_output)
            self.set_crunch_output()

        if runtimeContext.compute_checksum:
            adjustDirObjs(self.final_output,
                          partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output,
                           partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
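The collection cache sizing above is plain arithmetic: the summed +size suffixes of the keep: locators are scaled by 192, converted to MiB, and floored at 256. For a made-up total of 2 MiB of locator size:

estimated_bytes = 2 * 1024 * 1024   # hypothetical sum collected by estimate_collection_cache
cache_mib = max(((estimated_bytes * 192) / (1024 * 1024)) + 1, 256)
print(cache_mib)                    # -> 385 (a float on Python 3), so the cap becomes 385 MiB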
Example #49
0
 def setUpClass(cls):
     super(KeepProxyTestCase, cls).setUpClass()
     run_test_server.authorize_with('active')
     cls.api_client = arvados.api('v1')
Example #50
0
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    args = parse_arguments(arguments)

    api = arvados.api('v1')

    now = datetime.datetime.utcnow()
    start_time = args.start or api_timestamp(now - datetime.timedelta(days=1))
    end_time = args.end or api_timestamp(now)

    # Find all jobs created within the specified window,
    # and their corresponding job logs.
    jobs_created = jobs_created_between_dates(api, start_time, end_time)
    jobs_by_state = {}
    for job in jobs_created:
        jobs_by_state.setdefault(job['state'], [])
        jobs_by_state[job['state']].append(job)

    # Find failed jobs and record the job failure text.

    # failure_stats maps failure types (e.g. "sys/docker") to
    # a set of job UUIDs that failed for that reason.
    failure_stats = {}
    for job in jobs_by_state['Failed']:
        job_uuid = job['uuid']
        logs = job_logs(api, job)
        # Find the first permanent task failure, and collect the
        # preceding log lines.
        failure_type = None
        for i, lg in enumerate(logs):
            if is_failed_task(lg):
                # Get preceding log record to provide context.
                log_start = i - LOG_CONTEXT_LINES if i >= LOG_CONTEXT_LINES else 0
                log_end = i + 1
                lastlogs = ''.join(logs[log_start:log_end])
                # try to identify the type of failure.
                for key, rgx in JOB_FAILURE_TYPES.iteritems():
                    if re.search(rgx, lastlogs):
                        failure_type = key
                        break
            if failure_type is not None:
                break
        if failure_type is None:
            failure_type = 'unknown'
        failure_stats.setdefault(failure_type, set())
        failure_stats[failure_type].add(job_uuid)

    # Report percentages of successful, failed and unfinished jobs.
    print "Start: {:20s}".format(start_time)
    print "End:   {:20s}".format(end_time)
    print ""

    print "Overview"
    print ""

    job_start_count = len(jobs_created)
    print "  {: <25s} {:4d}".format('Started', job_start_count)
    for state in ['Complete', 'Failed', 'Queued', 'Cancelled', 'Running']:
        if state in jobs_by_state:
            job_count = len(jobs_by_state[state])
            job_percentage = job_count / float(job_start_count)
            print "  {: <25s} {:4d} ({: >4.0%})".format(state,
                                                        job_count,
                                                        job_percentage)
    print ""

    # Report failure types.
    failure_summary = ""
    failure_detail = ""

    # Generate a mapping from failed job uuids to job records, to assist
    # in generating detailed statistics for job failures.
    jobs_failed_map = { job['uuid']: job for job in jobs_by_state.get('Failed', []) }

    # sort the failure stats in descending order by occurrence.
    sorted_failures = sorted(failure_stats,
                             reverse=True,
                             key=lambda failure_type: len(failure_stats[failure_type]))
    for failtype in sorted_failures:
        job_uuids = failure_stats[failtype]
        failstat = "  {: <25s} {:4d} ({: >4.0%})\n".format(
            failtype,
            len(job_uuids),
            len(job_uuids) / float(len(jobs_by_state['Failed'])))
        failure_summary = failure_summary + failstat
        failure_detail = failure_detail + failstat
        for j in job_uuids:
            job_info = jobs_failed_map[j]
            job_owner = job_user_name(api, job_info['modified_by_user_uuid'])
            job_name = job_pipeline_name(api, job_info['uuid'])
            failure_detail = failure_detail + "    {}  {: <15.15s}  {:29.29s}\n".format(j, job_owner, job_name)
        failure_detail = failure_detail + "\n"

    print "Failures by class"
    print ""
    print failure_summary

    print "Failures by class (detail)"
    print ""
    print failure_detail

    return 0
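To make the percentage lines above concrete: failure_stats maps each failure class to the set of job UUIDs assigned to it, so with four failed jobs split like this (the UUIDs are placeholders), each class is reported with a count of 2 and a share of 50%.

failure_stats = {
    'sys/docker': {'zzzzz-8i9sb-aaaaaaaaaaaaaaa', 'zzzzz-8i9sb-bbbbbbbbbbbbbbb'},
    'unknown':    {'zzzzz-8i9sb-ccccccccccccccc', 'zzzzz-8i9sb-ddddddddddddddd'},
}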
Example #51
0
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']['reference_collection'])
    job_input_pdh = arvados.current_job()['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job()['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job()['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh,
                                                   job_input_pdh,
                                                   interval_lists_pdh,
                                                   group_by_regex,
                                                   max_gvcfs_to_combine,
                                                   if_sequence=0,
                                                   and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(
        interval_count,
        validate_task_output,
        reuse_tasks=True,
        oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
        if_sequence=1,
        and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert (this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join(
                [interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError(
                "Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file,
                                   gvcf_files,
                                   os.path.join(out_dir, out_file),
                                   extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
Example #52
0
def setUpClass(cls):
    super(KeepOptionalPermission, cls).setUpClass()
    run_test_server.authorize_with("admin")
    cls.api_client = arvados.api('v1')
Example #53
0
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params = [
            'inputs', 'ref', 'name'
        ]  # N.B. inputs collection includes input vcfs and corresponding interval_list
        script = "gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(
            1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params,
                                                    reusable_tasks,
                                                    task_key_params,
                                                    validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(
            ref_input_pdh,
            job_input_pdh,
            interval_lists_pdh,
            group_by_regex,
            if_sequence=0,
            and_end_task=True,
            create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file,
                                    interval_list_file,
                                    gvcf_files,
                                    os.path.join(out_dir, out_file),
                                    cores="4",
                                    java_mem="19g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
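Both task scripts above call validate_task_output() on the new Keep locator before setting it as the task output, but the function itself is not shown in these snippets. A minimal sketch, assuming an output counts as valid when the collection behind the locator has a non-empty manifest, might look like this:

import arvados
import arvados.collection

def validate_task_output(output_locator):
    # Hypothetical sketch: read the collection back from Keep and treat it
    # as valid if its manifest lists at least one file.
    try:
        reader = arvados.collection.CollectionReader(output_locator)
        return bool(reader.manifest_text().strip())
    except Exception as e:
        print("ERROR: could not read output collection %s: %s" % (output_locator, e))
        return False

The real validate_task_output() used by these pipelines may apply stricter, domain-specific checks; this sketch only captures the shape of the call.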
Example #54
0
def main():
    parser = argparse.ArgumentParser(
        description='Upload SARS-CoV-2 sequences for analysis')
    parser.add_argument('metadata',
                        type=argparse.FileType('r'),
                        help='sequence metadata json')
    parser.add_argument('sequence_p1',
                        type=argparse.FileType('rb'),
                        default=None,
                        nargs='?',
                        help='sequence FASTA/FASTQ')
    parser.add_argument('sequence_p2',
                        type=argparse.FileType('rb'),
                        default=None,
                        nargs='?',
                        help='sequence FASTQ pair')
    parser.add_argument("--validate",
                        action="store_true",
                        help="Dry run, validate only")
    parser.add_argument("--skip-qc",
                        action="store_true",
                        help="Skip local qc check")
    parser.add_argument(
        "--trusted",
        action="store_true",
        help="Trust local validation and add directly to validated project")
    args = parser.parse_args()

    if args.trusted:
        # Use credentials from environment
        api = arvados.api()
    else:
        api = arvados.api(host=ARVADOS_API_HOST,
                          token=UPLOADER_API_TOKEN,
                          insecure=True)

    # ---- First the QC
    target = qc_stuff(args.metadata, args.sequence_p1, args.sequence_p2,
                      not args.skip_qc)
    if target:
        seqlabel = target[0][1]
    else:
        seqlabel = ""

    if args.validate:
        log.info("Valid")
        exit(0)

    col = arvados.collection.Collection(api_client=api)

    # ---- Upload the sequence to Arvados
    if args.sequence_p1:
        upload_sequence(col, target[0], args.sequence_p1)
        if args.sequence_p2:
            upload_sequence(col, target[1], args.sequence_p2)

    # ---- Make sure the metadata YAML is valid
    log.info("Reading metadata")
    with col.open("metadata.yaml", "w") as f:
        r = args.metadata.read(65536)
        log.info(r[0:20])
        while r:
            f.write(r)
            r = args.metadata.read(65536)

    # ---- Get the uploader IP address (gateway) and local user info
    external_ip = urllib.request.urlopen('https://ident.me').read().decode(
        'utf8')

    try:
        username = getpass.getuser()
    except KeyError:
        username = "******"

    properties = {
        "sequence_label": seqlabel,
        "upload_app": "bh20-seq-uploader",
        "upload_ip": external_ip,
        "upload_user": "******" % (username, socket.gethostname())
    }

    # ---- Get ready for actual uploading
    api2 = arvados.api(host=ARVADOS_API_HOST,
                       token=ANONYMOUS_API_TOKEN,
                       insecure=True)
    dup = api2.collections().list(
        filters=[["owner_uuid", "in", [VALIDATED_PROJECT, UPLOAD_PROJECT]],
                 ["portable_data_hash", "=",
                  col.portable_data_hash()]]).execute()
    if dup["items"]:
        # This exact collection has been uploaded before.
        log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
        exit(1)

    if args.trusted:
        properties["status"] = "validated"
        owner_uuid = VALIDATED_PROJECT
    else:
        owner_uuid = UPLOAD_PROJECT

    # ---- and stream the 'collection' up
    col.save_new(
        owner_uuid=owner_uuid,
        name="%s uploaded by %s from %s" %
        (seqlabel, properties['upload_user'], properties['upload_ip']),
        properties=properties,
        ensure_unique_name=True)

    log.info("Saved to %s" % col.manifest_locator())
    log.info("Done")
    exit(0)
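upload_sequence() is called above but not shown in this snippet. A rough sketch, assuming each target entry is a (collection_filename, label) pair as suggested by seqlabel = target[0][1], might stream the local file into the collection like this:

def upload_sequence(col, target, sequence_file):
    # Hypothetical sketch: copy the uploaded FASTA/FASTQ file into the
    # Arvados collection under the name chosen during QC, in 64 KiB chunks.
    with col.open(target[0], "wb") as out:
        chunk = sequence_file.read(65536)
        while chunk:
            out.write(chunk)
            chunk = sequence_file.read(65536)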
Example #55
0
def parse_arguments(arglist):
    if not args.destination:
        if len(args.uuids) < 2:
            parser.error("missing destination project UUID argument")
        args.destination = args.uuids.pop()
    return args


def setup_logging(args):
    log_handler = logging.StreamHandler()
    log_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s %(name)s[%(process)d] %(levelname)s: %(message)s',
            '%Y-%m-%d %H:%M:%S'))
    logger.addHandler(log_handler)
    logger.setLevel(max(1, logging.WARNING - (10 * args.verbose)))


def main(stdin, stdout, stderr, arglist, arv):
    args = parse_arguments(arglist)
    setup_logging(args)
    uuid_mapper = UUIDMapper(arv)
    dependencies = DependencyTracker(uuid_mapper, logger)
    for uuid in args.uuids:
        dependencies.add_object(uuid, arv)
    dependencies.move_to(args.destination, arv, args.request_handler)


if __name__ == '__main__':
    main(sys.stdin, sys.stdout, sys.stderr, sys.argv[1:],
         arvados.api('v1', model=OrderedJsonModel()))
Example #56
0
def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None):
    args = arg_parser.parse_args(arguments)
    if api is None:
        api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        try:
            for i, j in list_images_in_arv(api, args.retries):
                stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        except IOError as e:
            if e.errno == errno.EPIPE:
                pass
            else:
                raise
        sys.exit(0)

    if re.search(r':\w[-.\w]{0,127}$', args.image):
        # image ends with :valid-tag
        if args.tag is not None:
            logger.error(
                "image %r already includes a tag, cannot add tag argument %r",
                args.image, args.tag)
            sys.exit(1)
        # rsplit() accommodates "myrepo.example:8888/repo/image:tag"
        args.image, args.tag = args.image.rsplit(':', 1)
    elif args.tag is None:
        args.tag = 'latest'

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        logger.error(str(error))
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            logger.warning("forcing incompatible image")
        else:
            logger.error("refusing to store " \
                "incompatible format (use --force-image-format to override)")
            sys.exit(1)

    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag.replace("/", " "), image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    # Acquire a lock so that only one arv-keepdocker process will
    # dump/upload a particular docker image at a time.  Do this before
    # checking if the image already exists in Arvados so that if there
    # is an upload already underway, when that upload completes and
    # this process gets a turn, it will discover the Docker image is
    # already available and exit quickly.
    outfile_name = '{}.tar'.format(image_hash)
    lockfile_name = '{}.lock'.format(outfile_name)
    lockfile = None
    cache_dir = get_cache_dir()
    if cache_dir:
        lockfile = open(os.path.join(cache_dir, lockfile_name), 'w+')
        fcntl.flock(lockfile, fcntl.LOCK_EX)

    try:
        if not args.force:
            # Check if this image is already in Arvados.

            # Project where everything should be owned
            parent_project_uuid = args.project_uuid or api.users().current().execute(
                num_retries=args.retries)['uuid']

            # Find image hash tags
            existing_links = _get_docker_links(
                api, args.retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', '=', image_hash]])
            if existing_links:
                # get readable collections
                collections = api.collections().list(
                    filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                    select=["uuid", "owner_uuid", "name", "manifest_text"]
                    ).execute(num_retries=args.retries)['items']

                if collections:
                    # check for repo+tag links on these collections
                    if image_repo_tag:
                        existing_repo_tag = _get_docker_links(
                            api, args.retries,
                            filters=[['link_class', '=', 'docker_image_repo+tag'],
                                     ['name', '=', image_repo_tag],
                                     ['head_uuid', 'in', [c["uuid"] for c in collections]]])
                    else:
                        existing_repo_tag = []

                    try:
                        coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
                    except StopIteration:
                        # create new collection owned by the project
                        coll_uuid = api.collections().create(
                            body={"manifest_text": collections[0]['manifest_text'],
                                  "name": collection_name,
                                  "owner_uuid": parent_project_uuid,
                                  "properties": {"docker-image-repo-tag": image_repo_tag}},
                            ensure_unique_name=True
                            ).execute(num_retries=args.retries)['uuid']

                    link_base = {'owner_uuid': parent_project_uuid,
                                 'head_uuid':  coll_uuid,
                                 'properties': existing_links[0]['properties']}

                    if not any(items_owned_by(parent_project_uuid, existing_links)):
                        # create image link owned by the project
                        make_link(api, args.retries,
                                  'docker_image_hash', image_hash, **link_base)

                    if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                        # create repo+tag link owned by the project
                        make_link(api, args.retries, 'docker_image_repo+tag',
                                  image_repo_tag, **link_base)

                    stdout.write(coll_uuid + "\n")

                    sys.exit(0)

        # Open a file for the saved image, and write it if needed.
        image_file, need_save = prep_image_file(outfile_name)
        if need_save:
            save_image(image_hash, image_file)

        # Call arv-put with switches we inherited from it
        # (a.k.a., switches that aren't our own).
        if arguments is None:
            arguments = sys.argv[1:]
        arguments = [i for i in arguments if i not in (args.image, args.tag, image_repo_tag)]
        put_args = keepdocker_parser.parse_known_args(arguments)[1]

        if args.name is None:
            put_args += ['--name', collection_name]

        coll_uuid = arv_put.main(
            put_args + ['--filename', outfile_name, image_file.name], stdout=stdout,
            install_sig_handlers=install_sig_handlers).strip()

        # Managed properties could be already set
        coll_properties = api.collections().get(uuid=coll_uuid).execute(num_retries=args.retries).get('properties', {})
        coll_properties.update({"docker-image-repo-tag": image_repo_tag})

        api.collections().update(uuid=coll_uuid, body={"properties": coll_properties}).execute(num_retries=args.retries)

        # Read the image metadata and make Arvados links from it.
        image_file.seek(0)
        image_tar = tarfile.open(fileobj=image_file)
        image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
        if image_hash_type:
            json_filename = raw_image_hash + '.json'
        else:
            json_filename = raw_image_hash + '/json'
        json_file = image_tar.extractfile(image_tar.getmember(json_filename))
        image_metadata = json.loads(json_file.read().decode('utf-8'))
        json_file.close()
        image_tar.close()
        link_base = {'head_uuid': coll_uuid, 'properties': {}}
        if 'created' in image_metadata:
            link_base['properties']['image_timestamp'] = image_metadata['created']
        if args.project_uuid is not None:
            link_base['owner_uuid'] = args.project_uuid

        make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
        if image_repo_tag:
            make_link(api, args.retries,
                      'docker_image_repo+tag', image_repo_tag, **link_base)

        # Clean up.
        image_file.close()
        for filename in [stat_cache_name(image_file), image_file.name]:
            try:
                os.unlink(filename)
            except OSError as error:
                if error.errno != errno.ENOENT:
                    raise
    finally:
        if lockfile is not None:
            # Closing the lockfile unlocks it.
            lockfile.close()
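The lock-file dance near the top of the try block above is what keeps concurrent arv-keepdocker runs from dumping the same image twice. Isolated, the pattern is just an exclusive flock on a per-image lock file; a small sketch follows (the helper name is illustrative):

import fcntl
import os

def acquire_image_lock(cache_dir, image_hash):
    # Sketch of the locking pattern used above: open (or create) a lock
    # file named after the image tarball and block until an exclusive
    # flock is granted. Closing the returned file object releases the lock.
    lock_path = os.path.join(cache_dir, '{}.tar.lock'.format(image_hash))
    lockfile = open(lock_path, 'w+')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile

Holding the lock for the whole dump/upload means a second process wakes up only after the image is already in Arvados, so it can take the reuse path above and exit quickly.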
Example #57
0
def run_test(name, actions, checks, driver_class, jobs, provider):
    code = 0
    global unsatisfiable_job_scancelled
    unsatisfiable_job_scancelled = os.path.join(tempfile.mkdtemp(),
                                                "scancel_called")

    # Delete any stale node records
    api = arvados.api('v1')
    for n in api.nodes().list().execute()['items']:
        api.nodes().delete(uuid=n["uuid"]).execute()

    logger.info("Start %s", name)

    global fake_slurm
    fake_slurm = tempfile.mkdtemp()
    detail.info("fake_slurm is %s", fake_slurm)

    global compute_nodes
    compute_nodes = {}

    global all_jobs
    all_jobs = jobs

    env = os.environ.copy()
    env["PATH"] = fake_slurm + ":" + env["PATH"]

    # Reset fake squeue/sinfo to empty
    update_script(os.path.join(fake_slurm, "squeue"), "#!/bin/sh\n")
    update_script(os.path.join(fake_slurm, "sinfo"), "#!/bin/sh\n")

    # Write configuration file for test
    with open("tests/fake_%s.cfg.template" % provider) as f:
        open(os.path.join(fake_slurm, "id_rsa.pub"), "w").close()
        with open(os.path.join(fake_slurm, "fake.cfg"), "w") as cfg:
            cfg.write(f.read().format(host=os.environ["ARVADOS_API_HOST"],
                                      token=os.environ["ARVADOS_API_TOKEN"],
                                      driver_class=driver_class,
                                      ssh_key=os.path.join(fake_slurm, "id_rsa.pub")))

    # Tests must complete in less than 30 seconds.
    timeout = time.time() + 30
    terminated = False

    # Now start node manager
    p = subprocess.Popen(["bin/arvados-node-manager", "--foreground", "--config", os.path.join(fake_slurm, "fake.cfg")],
                         bufsize=0, stderr=subprocess.PIPE, env=env)

    # Test main loop:
    # - Read line
    # - Apply negative checks (things that are not supposed to happen)
    # - Check timeout
    # - Check if the next action should trigger
    # - If all actions are exhausted, terminate with test success
    # - If it hits timeout with actions remaining, terminate with test failed
    try:
        # naive line iteration over pipes gets buffered, which isn't what we want,
        # see https://bugs.python.org/issue3907
        for line in iter(p.stderr.readline, ""):
            detail_content.write(line)

            for k,v in checks.items():
                g = re.match(k, line)
                if g:
                    detail.info("Matched check %s", k)
                    code += v(checks, k, g)
                    if code != 0:
                        detail.error("Check failed")
                        if not terminated:
                            p.kill()
                            terminated = True

            if terminated:
                continue

            if time.time() > timeout:
                detail.error("Exceeded timeout with actions remaining: %s", actions)
                code += 1
                if not terminated:
                    p.kill()
                    terminated = True

            k, v = actions[0]
            g = re.match(k, line)
            if g:
                detail.info("Matched action %s", k)
                actions.pop(0)
                code += v(g)
                if code != 0:
                    detail.error("Action failed")
                    p.kill()
                    terminated = True

            if not actions:
                p.kill()
                terminated = True
    except KeyboardInterrupt:
        p.kill()

    if actions:
        detail.error("Ended with remaining actions: %s", actions)
        code = 1

    shutil.rmtree(fake_slurm)
    shutil.rmtree(os.path.dirname(unsatisfiable_job_scancelled))

    if code == 0:
        logger.info("%s passed", name)
    else:
        if isinstance(detail_content, StringIO.StringIO):
            detail_content.seek(0)
            chunk = detail_content.read(4096)
            while chunk:
                try:
                    sys.stderr.write(chunk)
                    chunk = detail_content.read(4096)
                except IOError as e:
                    if e.errno == errno.EAGAIN:
                        # try again (probably pipe buffer full)
                        pass
                    else:
                        raise
        logger.info("%s failed", name)

    return code
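update_script() is used above to point the node manager at fake squeue/sinfo commands, but it is not shown in this snippet. A plausible sketch, assuming it simply rewrites the shim and marks it executable, is:

import os
import stat

def update_script(path, body):
    # Hypothetical sketch: write the fake SLURM command atomically and
    # make it executable so it wins on the test's modified PATH.
    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        f.write(body)
    os.chmod(tmp, stat.S_IRWXU)
    os.rename(tmp, path)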
Example #58
0
def main(arguments=None):
    args = arvrun_parser.parse_args(arguments)

    if len(args.args) == 0:
        arvrun_parser.print_help()
        return

    starting_args = args.args

    reading_into = 2

    # Parse the command arguments into 'slots'.
    # All words following '>' are output arguments and are collected into slots[0].
    # All words following '<' are input arguments and are collected into slots[1].
    # slots[2..] store the parameters of each command in the pipeline.
    #
    # e.g. arv-run foo arg1 arg2 '|' bar arg3 arg4 '<' input1 input2 input3 '>' output.txt
    # will be parsed into:
    #   [['output.txt'],
    #    ['input1', 'input2', 'input3'],
    #    ['foo', 'arg1', 'arg2'],
    #    ['bar', 'arg3', 'arg4']]
    slots = [[], [], []]
    for c in args.args:
        if c.startswith('>'):
            reading_into = 0
            if len(c) > 1:
                slots[reading_into].append(c[1:])
        elif c.startswith('<'):
            reading_into = 1
            if len(c) > 1:
                slots[reading_into].append(c[1:])
        elif c == '|':
            reading_into = len(slots)
            slots.append([])
        else:
            slots[reading_into].append(c)

    if slots[0] and len(slots[0]) > 1:
        logger.error(
            "Can only specify a single stdout file (run-command substitutions are permitted)"
        )
        return

    if not args.dry_run:
        api = arvados.api('v1')
        if args.project_uuid:
            project = args.project_uuid
        else:
            project = determine_project(
                os.getcwd(),
                api.users().current().execute()["uuid"])

    # Identify input files.  Look at each parameter and test to see if there is
    # a file by that name.  This uses 'patterns' to look for within
    # command line arguments, such as --foo=file.txt or -lfile.txt
    patterns = [re.compile("([^=]+=)(.*)"), re.compile("(-[A-Za-z])(.+)")]
    for j, command in enumerate(slots[1:]):
        for i, a in enumerate(command):
            if j > 0 and i == 0:
                # j == 0 is stdin, j > 0 is commands
                # always skip program executable (i == 0) in commands
                pass
            elif a.startswith('\\'):
                # if it starts with a \ then don't do any interpretation
                command[i] = a[1:]
            else:
                # See if it looks like a file
                command[i] = statfile('', a)

                # If a file named command[i] was found, it would now be an
                # ArvFile or UploadFile.  If command[i] is a basestring, that
                # means it doesn't correspond exactly to a file, so do some
                # pattern matching.
                if isinstance(command[i], basestring):
                    for p in patterns:
                        m = p.match(a)
                        if m:
                            command[i] = statfile(m.group(1), m.group(2))
                            break

    files = [
        c for command in slots[1:] for c in command
        if isinstance(c, UploadFile)
    ]
    if files:
        uploadfiles(files,
                    api,
                    dry_run=args.dry_run,
                    num_retries=args.retries,
                    project=project)

    for i in range(1, len(slots)):
        slots[i] = [
            ("%s%s" % (c.prefix, c.fn)) if isinstance(c, ArvFile) else c
            for c in slots[i]
        ]

    component = {
        "script": "run-command",
        "script_version": args.script_version,
        "repository": args.repository,
        "script_parameters": {},
        "runtime_constraints": {}
    }

    if args.docker_image:
        component["runtime_constraints"]["docker_image"] = args.docker_image

    task_foreach = []
    group_parser = argparse.ArgumentParser()
    group_parser.add_argument('-b', '--batch-size', type=int)
    group_parser.add_argument('args', nargs=argparse.REMAINDER)

    for s in range(2, len(slots)):
        for i in range(0, len(slots[s])):
            if slots[s][i] == '--':
                inp = "input%i" % (s - 2)
                groupargs = group_parser.parse_args(slots[2][i + 1:])
                if groupargs.batch_size:
                    component["script_parameters"][inp] = {
                        "value": {
                            "batch": groupargs.args,
                            "size": groupargs.batch_size
                        }
                    }
                    slots[s] = slots[s][0:i] + [{
                        "foreach": inp,
                        "command": "$(%s)" % inp
                    }]
                else:
                    component["script_parameters"][inp] = groupargs.args
                    slots[s] = slots[s][0:i] + ["$(%s)" % inp]
                task_foreach.append(inp)
                break
            if slots[s][i] == '\--':
                slots[s][i] = '--'

    if slots[0]:
        component["script_parameters"]["task.stdout"] = slots[0][0]
    if slots[1]:
        task_foreach.append("stdin")
        component["script_parameters"]["stdin"] = slots[1]
        component["script_parameters"]["task.stdin"] = "$(stdin)"

    if task_foreach:
        component["script_parameters"]["task.foreach"] = task_foreach

    component["script_parameters"]["command"] = slots[2:]
    if args.ignore_rcode:
        component["script_parameters"]["task.ignore_rcode"] = args.ignore_rcode

    pipeline = {
        "name": "arv-run " + " | ".join([s[0] for s in slots[2:]]),
        "description": "@" + " ".join(starting_args) + "@",
        "components": {
            "command": component
        },
        "state": "RunningOnClient" if args.local else "RunningOnServer"
    }

    if args.dry_run:
        print(json.dumps(pipeline, indent=4))
    else:
        pipeline["owner_uuid"] = project
        pi = api.pipeline_instances().create(
            body=pipeline, ensure_unique_name=True).execute()
        logger.info("Running pipeline %s", pi["uuid"])

        if args.local:
            subprocess.call([
                "arv-run-pipeline-instance", "--instance", pi["uuid"],
                "--run-jobs-here"
            ] + (["--no-reuse"] if args.no_reuse else []))
        elif not args.no_wait:
            ws.main(["--pipeline", pi["uuid"]])

        pi = api.pipeline_instances().get(uuid=pi["uuid"]).execute()
        logger.info("Pipeline is %s", pi["state"])
        if "output_uuid" in pi["components"]["command"]:
            logger.info("Output is %s",
                        pi["components"]["command"]["output_uuid"])
        else:
            logger.info("No output")
Example #59
0
def main(arguments=None):
    logger = logging.getLogger('arvados.arv-ws')

    parser = argparse.ArgumentParser()
    parser.add_argument('--version',
                        action='version',
                        version="%s %s" % (sys.argv[0], __version__),
                        help='Print version and exit.')
    parser.add_argument('-u',
                        '--uuid',
                        type=str,
                        default="",
                        help="Filter events on object_uuid")
    parser.add_argument(
        '-f',
        '--filters',
        type=str,
        default="",
        help="Arvados query filter to apply to log events (JSON encoded)")
    parser.add_argument(
        '-s',
        '--start-time',
        type=str,
        default="",
        help=
        "Arvados query filter to fetch log events created at or after this time. This will be server time in UTC. Allowed format: YYYY-MM-DD or YYYY-MM-DD hh:mm:ss"
    )
    parser.add_argument('-i',
                        '--id',
                        type=int,
                        default=None,
                        help="Start from given log id.")

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--poll-interval',
        default=15,
        type=int,
        help=
        "If websockets is not available, specify the polling interval, default is every 15 seconds"
    )
    group.add_argument(
        '--no-poll',
        action='store_false',
        dest='poll_interval',
        help="Do not poll if websockets are not available, just fail")

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '-p',
        '--pipeline',
        type=str,
        default="",
        help="Supply pipeline uuid, print log output from pipeline and its jobs"
    )
    group.add_argument('-j',
                       '--job',
                       type=str,
                       default="",
                       help="Supply job uuid, print log output from jobs")

    args = parser.parse_args(arguments)

    global filters
    global known_component_jobs
    global ws

    filters = []
    known_component_jobs = set()
    ws = None

    def update_subscribed_components(components):
        global known_component_jobs
        global filters
        pipeline_jobs = set()
        for c in components:
            if "job" in components[c]:
                pipeline_jobs.add(components[c]["job"]["uuid"])
        if known_component_jobs != pipeline_jobs:
            ws.unsubscribe(filters)
            filters = [[
                'object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)
            ]]
            ws.subscribe(
                [['object_uuid', 'in', [args.pipeline] + list(pipeline_jobs)]])
            known_component_jobs = pipeline_jobs

    api = arvados.api('v1')

    if args.uuid:
        filters += [['object_uuid', '=', args.uuid]]

    if args.filters:
        filters += json.loads(args.filters)

    if args.job:
        filters += [['object_uuid', '=', args.job]]

    if args.pipeline:
        filters += [['object_uuid', '=', args.pipeline]]

    if args.start_time:
        last_log_id = 1
        filters += [['created_at', '>=', args.start_time]]
    else:
        last_log_id = None

    if args.id:
        last_log_id = args.id - 1

    def on_message(ev):
        global filters
        global ws

        logger.debug(ev)
        if 'event_type' in ev and (args.pipeline or args.job):
            if ev['event_type'] in ('stderr', 'stdout'):
                sys.stdout.write(ev["properties"]["text"])
            elif ev["event_type"] in ("create", "update"):
                if ev["object_kind"] == "arvados#pipelineInstance":
                    update_subscribed_components(
                        ev["properties"]["new_attributes"]["components"])

                if ev["object_kind"] == "arvados#pipelineInstance" and args.pipeline:
                    if ev["properties"]["new_attributes"]["state"] in (
                            "Complete", "Failed", "Paused"):
                        ws.close()

                if ev["object_kind"] == "arvados#job" and args.job:
                    if ev["properties"]["new_attributes"]["state"] in (
                            "Complete", "Failed", "Cancelled"):
                        ws.close()
        elif 'status' in ev and ev['status'] == 200:
            pass
        else:
            print(json.dumps(ev))

    try:
        ws = subscribe(arvados.api('v1'),
                       filters,
                       on_message,
                       poll_fallback=args.poll_interval,
                       last_log_id=last_log_id)
        if ws:
            if args.pipeline:
                c = api.pipeline_instances().get(uuid=args.pipeline).execute()
                update_subscribed_components(c["components"])
                if c["state"] in ("Complete", "Failed", "Paused"):
                    ws.close()
            ws.run_forever()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logger.error(e)
    finally:
        if ws:
            ws.close()
Example #60
0
def on_message(ev):
    properties_options = ['Complete', 'Failed']
    event_type_options = ['update', 'create']
    try:
        if ev.get("event_type") in event_type_options and ev.get(
                "object_kind") in object_kind_options and ev["properties"][
                    "new_attributes"]["state"] in properties_options:
            print "%s %s %s" % (ev.get("object_kind"), ev["object_uuid"],
                                ev["properties"]["new_attributes"]["state"])
        if ev["properties"]["new_attributes"]["state"] == "Failed" or ev[
                "properties"]["new_attributes"]["state"] == "Complete":
            print "%s %s %s" % (ev.get("object_kind"), ev["object_uuid"],
                                ev["properties"]["new_attributes"]["state"])
            try:
                header = "%s %s %s %s" % (
                    ev.get("object_kind"), ev["object_uuid"],
                    ev["properties"]["old_attributes"]["name"],
                    ev["properties"]["new_attributes"]["state"])
            except:
                header = "%s %s %s" % (
                    ev.get("object_kind"), ev["object_uuid"],
                    ev["properties"]["new_attributes"]["state"])
            subprocess.check_call(
                ['/home/bcosc/gitrepos/arv-email/email-me.py', '-d', header])
    except:
        pass


api = arvados.api("v1")
ws = arvados.events.subscribe(api, [], on_message)
ws.run_forever()