Example #1
    def setUp(self):
        super(FuseMagicTest, self).setUp()

        cw = arvados.CollectionWriter()

        cw.start_new_file('thing1.txt')
        cw.write("data 1")

        self.testcollection = cw.finish()
        self.api.collections().create(body={
            "manifest_text": cw.manifest_text()
        }).execute()
Example #2
 def write_foo_bar_baz(self):
     cw = arvados.CollectionWriter(self.api_client)
     self.assertEqual(cw.current_stream_name(), '.',
                      'current_stream_name() should be "." now')
     cw.set_current_file_name('foo.txt')
     cw.write('foo')
     self.assertEqual(cw.current_file_name(), 'foo.txt',
                      'current_file_name() should be foo.txt now')
     cw.start_new_file('bar.txt')
     cw.write('bar')
     cw.start_new_stream('baz')
     cw.write('baz')
     cw.set_current_file_name('baz.txt')
     return cw.finish()
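
The helper above returns the locator from cw.finish(); below is a minimal sketch of reading the finished collection back (the helper name is hypothetical, it uses only the CollectionReader calls that Example #10 below also relies on, and it assumes the written blocks are still retrievable from Keep):

import arvados

def read_back_all(manifest_locator):
    # Open the finished collection and concatenate the contents of every file;
    # for write_foo_bar_baz() above this yields 'foo' + 'bar' + 'baz'.
    cr = arvados.CollectionReader(manifest_locator)
    data = ''
    for f in cr.all_files():
        for line in f.readlines():
            data += line
    return data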
Example #3
    def setUp(self, api=None):
        super(FuseMagicTest, self).setUp(api=api)

        cw = arvados.CollectionWriter()

        cw.start_new_file('thing1.txt')
        cw.write("data 1")

        self.testcollection = cw.finish()
        self.test_manifest = cw.manifest_text()
        coll = self.api.collections().create(
            body={
                "manifest_text": self.test_manifest
            }).execute()
        self.test_manifest_pdh = coll['portable_data_hash']
Example #4
    def setUp(self, api=None):
        super(FuseMagicTestPDHOnly, self).setUp(api=api)

        cw = arvados.CollectionWriter()

        cw.start_new_file('thing1.txt')
        cw.write("data 1")

        self.testcollection = cw.finish()
        self.test_manifest = cw.manifest_text()
        created = self.api.collections().create(
            body={
                "manifest_text": self.test_manifest
            }).execute()
        self.testcollectionuuid = str(created['uuid'])
Example #5
 def make_manifest(self,
                   bytes_per_block=1,
                   blocks_per_file=1,
                   files_per_stream=1,
                   streams=1):
     datablip = 'x' * bytes_per_block
     data_loc = tutil.str_keep_locator(datablip)
     with tutil.mock_keep_responses(data_loc, 200):
         coll = arvados.CollectionWriter()
         for si in range(0, streams):
             for fi in range(0, files_per_stream):
                 with coll.open("stream{}/file{}.txt".format(si, fi)) as f:
                     for bi in range(0, blocks_per_file):
                         f.write(datablip)
         return coll.manifest_text()
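
A hedged usage sketch of make_manifest() (a hypothetical test inside the same test case, assuming the usual Keep manifest layout in which each stream occupies one line of the manifest text):

 def test_make_manifest_stream_count(self):
     # Two streams with three files of two blocks each should produce a
     # manifest with exactly one line per stream.
     manifest = self.make_manifest(bytes_per_block=4,
                                   blocks_per_file=2,
                                   files_per_stream=3,
                                   streams=2)
     self.assertEqual(len(manifest.splitlines()), 2)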
Example #6
 def runTest(self):
     cw = arvados.CollectionWriter()
     self.assertEqual(cw.current_stream_name(), '.',
                      'current_stream_name() should be "." now')
     cw.set_current_file_name('foo.txt')
     cw.write('foo')
     self.assertEqual(cw.current_file_name(), 'foo.txt',
                      'current_file_name() should be foo.txt now')
     cw.start_new_file('bar.txt')
     cw.write('bar')
     cw.start_new_stream('baz')
     cw.write('baz')
     cw.set_current_file_name('baz.txt')
     hash = cw.finish()
     self.assertEqual(hash, '23ca013983d6239e98931cc779e68426+114',
                      'resulting manifest hash is not what I expected')
Example #7
    def setUp(self, api=None):
        super(FuseMagicTest, self).setUp(api=api)

        self.test_project = run_test_server.fixture('groups')['aproject']['uuid']
        self.non_project_group = run_test_server.fixture('groups')['public']['uuid']
        self.collection_in_test_project = run_test_server.fixture('collections')['foo_collection_in_aproject']['name']

        cw = arvados.CollectionWriter()

        cw.start_new_file('thing1.txt')
        cw.write("data 1")

        self.testcollection = cw.finish()
        self.test_manifest = cw.manifest_text()
        coll = self.api.collections().create(body={"manifest_text":self.test_manifest}).execute()
        self.test_manifest_pdh = coll['portable_data_hash']
Example #8
 def runTest(self):
     cw = arvados.CollectionWriter()
     self.assertEqual(cw.current_stream_name(), '.',
                      'current_stream_name() should be "." now')
     cw.set_current_file_name('foo.txt')
     cw.write('foo')
     self.assertEqual(cw.current_file_name(), 'foo.txt',
                      'current_file_name() should be foo.txt now')
     cw.start_new_file('bar.txt')
     cw.write('bar')
     cw.start_new_stream('baz')
     cw.write('baz')
     cw.set_current_file_name('baz.txt')
     hash = cw.finish()
     self.assertEqual(
         hash, 'd6c3b8e571f1b81ebb150a45ed06c884+114',
         "resulting manifest hash was {0}, expecting d6c3b8e571f1b81ebb150a45ed06c884+114"
         .format(hash))
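
The fixed locators asserted in Examples #6 and #8 can be recomputed from the manifest text; here is a minimal sketch, assuming finish() returns the MD5 hex digest of the manifest text followed by its length in bytes (which is what the +114 suffixes above encode):

import hashlib

def expected_locator(manifest_text):
    # <md5 hex digest of the manifest text>+<manifest length in bytes>
    return '%s+%d' % (hashlib.md5(manifest_text).hexdigest(), len(manifest_text))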
Example #9
    def setUp(self):
        super(FuseMountTest, self).setUp()

        cw = arvados.CollectionWriter()

        cw.start_new_file('thing1.txt')
        cw.write("data 1")
        cw.start_new_file('thing2.txt')
        cw.write("data 2")

        cw.start_new_stream('dir1')
        cw.start_new_file('thing3.txt')
        cw.write("data 3")
        cw.start_new_file('thing4.txt')
        cw.write("data 4")

        cw.start_new_stream('dir2')
        cw.start_new_file('thing5.txt')
        cw.write("data 5")
        cw.start_new_file('thing6.txt')
        cw.write("data 6")

        cw.start_new_stream('dir2/dir3')
        cw.start_new_file('thing7.txt')
        cw.write("data 7")

        cw.start_new_file('thing8.txt')
        cw.write("data 8")

        cw.start_new_stream('edgecases')
        for f in ":/.../-/*/\x01\\/ ".split("/"):
            cw.start_new_file(f)
            cw.write('x')

        for f in ":/.../-/*/\x01\\/ ".split("/"):
            cw.start_new_stream('edgecases/dirs/' + f)
            cw.start_new_file('x/x')
            cw.write('x')

        self.testcollection = cw.finish()
        self.api.collections().create(body={
            "manifest_text": cw.manifest_text()
        }).execute()
Example #10
    def runTest(self):
        n_lines_in = 2**18
        data_in = "abc\n"
        for x in xrange(0, 18):
            data_in += data_in
        compressed_data_in = bz2.compress(data_in)
        cw = arvados.CollectionWriter()
        cw.start_new_file('test.bz2')
        cw.write(compressed_data_in)
        bz2_manifest = cw.manifest_text()

        cr = arvados.CollectionReader(bz2_manifest)
        got = 0
        for x in list(cr.all_files())[0].readlines():
            self.assertEqual(x, "abc\n",
                             "decompression returned wrong data: %s" % x)
            got += 1
        self.assertEqual(
            got, n_lines_in, "decompression returned %d lines instead of %d" %
            (got, n_lines_in))
Example #11
def main():

    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" %
                              ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list.txt$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    if not os.access(chunk_file, os.R_OK):
        raise FileAccessError("Chunk intervals file not readable: %s" %
                              chunk_file)

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise


#    out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.vcf.gz")
    config_file = os.path.join(arvados.current_task().tmpdir, "mpileup.conf")
    lock_file = os.path.join(arvados.current_task().tmpdir,
                             "run-bt-mpileup.lock")

    if not os.path.exists(RUNNER_CONFIG_TEMPLATE):
        raise FileAccessError("No runner configuration template at %s" %
                              RUNNER_CONFIG_TEMPLATE)
    # generate config
    runner_config_text = jinja2.Environment(loader=jinja2.FileSystemLoader(
        "/")).get_template(RUNNER_CONFIG_TEMPLATE).render(
            fasta_reference=ref_file, input_cram=cram_file, regions=chunk_file)
    with open(config_file, "wb") as fh:
        fh.write(runner_config_text)
    # report configuration
    print "Generated runner config to %s:\n%s" % (config_file,
                                                  runner_config_text)

    # Call run-bt-mpileup
    runner_p = subprocess.Popen([
        "run-bt-mpileup", "+config", config_file, "+js", "mpm", "+loop", "5",
        "+lock", lock_file, "-o", out_dir
    ],
                                stdin=None,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                shell=False)

    while runner_p.poll() is None:
        line = runner_p.stdout.readline()
        # only print '#### unfinished' lines or things that are errors or warnings
        if re.search(r'\d+\s+unfinished', line) or re.search(
                r'(FATAL|ERROR|WARNING)', line, flags=re.IGNORECASE):
            print "RUNNER: %s" % line.rstrip()

    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # clean up out_dir
    try:
        os.remove(os.path.join(out_dir, "run-bt-mpileup.lock"))
        os.remove(os.path.join(out_dir, "mpileup.conf"))
        os.remove(os.path.join(out_dir, "cleaned-job-outputs.tgz"))
    except:
        print "WARNING: could not remove some output files!"
        pass

    out_bcf = os.path.join(
        out_dir,
        os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) +
        ".bcf")
    try:
        os.rename(os.path.join(out_dir, "all.bcf"), out_bcf)
        os.rename(os.path.join(out_dir, "all.bcf.csi"), out_bcf + ".csi")
        os.rename(os.path.join(out_dir, "all.bcf.filt.vchk"),
                  out_bcf + ".filt.vchk")
        os.rename(os.path.join(out_dir, "all.bcf.vchk"), out_bcf + ".vchk")
    except:
        print "WARNING: could not rename some output files!"
        pass

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
Example #12
def main(arguments=None):
    args = arvrun_parser.parse_args(arguments)

    if len(args.args) == 0:
        arvrun_parser.print_help()
        return

    starting_args = args.args

    reading_into = 2

    # Parse the command arguments into 'slots'.
    # All words following '>' are output arguments and are collected into slots[0].
    # All words following '<' are input arguments and are collected into slots[1].
    # slots[2..] store the parameters of each command in the pipeline.
    #
    # e.g. arv-run foo arg1 arg2 '|' bar arg3 arg4 '<' input1 input2 input3 '>' output.txt
    # will be parsed into:
    #   [['output.txt'],
    #    ['input1', 'input2', 'input3'],
    #    ['foo', 'arg1', 'arg2'],
    #    ['bar', 'arg3', 'arg4']]
    slots = [[], [], []]
    for c in args.args:
        if c.startswith('>'):
            reading_into = 0
            if len(c) > 1:
                slots[reading_into].append(c[1:])
        elif c.startswith('<'):
            reading_into = 1
            if len(c) > 1:
                slots[reading_into].append(c[1:])
        elif c == '|':
            reading_into = len(slots)
            slots.append([])
        else:
            slots[reading_into].append(c)

    if slots[0] and len(slots[0]) > 1:
        logger.error("Can only specify a single stdout file (run-command substitutions are permitted)")
        return

    if not args.dry_run:
        api = arvados.api('v1')
        if args.project_uuid:
            project = args.project_uuid
        else:
            project = determine_project(os.getcwd(), api.users().current().execute()["uuid"])

    # Identify input files.  Look at each parameter and test to see if there is
    # a file by that name.  This uses 'patterns' to look for within
    # command line arguments, such as --foo=file.txt or -lfile.txt
    patterns = [re.compile("([^=]+=)(.*)"),
                re.compile("(-[A-Za-z])(.+)")]
    for j, command in enumerate(slots[1:]):
        for i, a in enumerate(command):
            if j > 0 and i == 0:
                # j == 0 is stdin, j > 0 is commands
                # always skip program executable (i == 0) in commands
                pass
            elif a.startswith('\\'):
                # if it starts with a \ then don't do any interpretation
                command[i] = a[1:]
            else:
                # See if it looks like a file
                command[i] = statfile('', a)

                # If a file named command[i] was found, it would now be an
                # ArvFile or UploadFile.  If command[i] is a basestring, that
                # means it doesn't correspond exactly to a file, so do some
                # pattern matching.
                if isinstance(command[i], basestring):
                    for p in patterns:
                        m = p.match(a)
                        if m:
                            command[i] = statfile(m.group(1), m.group(2))
                            break

    n = True
    pathprefix = "/"
    files = [c for command in slots[1:] for c in command if isinstance(c, UploadFile)]
    if len(files) > 0:
        # Find the smallest path prefix that includes all the files that need to be uploaded.
        # This starts at the root and iteratively removes common parent directory prefixes
        # until all file paths no longer have a common parent.
        while n:
            pathstep = None
            for c in files:
                if pathstep is None:
                    sp = c.fn.split('/')
                    if len(sp) < 2:
                        # no parent directories left
                        n = False
                        break
                    # path step takes next directory
                    pathstep = sp[0] + "/"
                else:
                    # check if pathstep is common prefix for all files
                    if not c.fn.startswith(pathstep):
                        n = False
                        break
            if n:
                # pathstep is common parent directory for all files, so remove the prefix
                # from each path
                pathprefix += pathstep
                for c in files:
                    c.fn = c.fn[len(pathstep):]

        orgdir = os.getcwd()
        os.chdir(pathprefix)

        print("Upload local files: \"%s\"" % '" "'.join([c.fn for c in files]))

        if args.dry_run:
            print("$(input) is %s" % pathprefix.rstrip('/'))
            pdh = "$(input)"
        else:
            files = sorted(files, key=lambda x: x.fn)
            collection = arvados.CollectionWriter(api, num_retries=args.retries)
            stream = None
            for f in files:
                sp = os.path.split(f.fn)
                if sp[0] != stream:
                    stream = sp[0]
                    collection.start_new_stream(stream)
                collection.write_file(f.fn, sp[1])
            item = api.collections().create(body={"owner_uuid": project, "manifest_text": collection.manifest_text()}).execute()
            pdh = item["portable_data_hash"]
            print "Uploaded to %s" % item["uuid"]

        for c in files:
            c.fn = "$(file %s/%s)" % (pdh, c.fn)

        os.chdir(orgdir)

    for i in xrange(1, len(slots)):
        slots[i] = [("%s%s" % (c.prefix, c.fn)) if isinstance(c, ArvFile) else c for c in slots[i]]

    component = {
        "script": "run-command",
        "script_version": args.script_version,
        "repository": args.repository,
        "script_parameters": {
        },
        "runtime_constraints": {
            "docker_image": args.docker_image
        }
    }

    task_foreach = []
    group_parser = argparse.ArgumentParser()
    group_parser.add_argument('-b', '--batch-size', type=int)
    group_parser.add_argument('args', nargs=argparse.REMAINDER)

    for s in xrange(2, len(slots)):
        for i in xrange(0, len(slots[s])):
            if slots[s][i] == '--':
                inp = "input%i" % (s-2)
                groupargs = group_parser.parse_args(slots[2][i+1:])
                if groupargs.batch_size:
                    component["script_parameters"][inp] = {"value": {"batch":groupargs.args, "size":groupargs.batch_size}}
                    slots[s] = slots[s][0:i] + [{"foreach": inp, "command": "$(%s)" % inp}]
                else:
                    component["script_parameters"][inp] = groupargs.args
                    slots[s] = slots[s][0:i] + ["$(%s)" % inp]
                task_foreach.append(inp)
                break
            if slots[s][i] == '\--':
                slots[s][i] = '--'

    if slots[0]:
        component["script_parameters"]["task.stdout"] = slots[0][0]
    if slots[1]:
        task_foreach.append("stdin")
        component["script_parameters"]["stdin"] = slots[1]
        component["script_parameters"]["task.stdin"] = "$(stdin)"

    if task_foreach:
        component["script_parameters"]["task.foreach"] = task_foreach

    component["script_parameters"]["command"] = slots[2:]
    if args.ignore_rcode:
        component["script_parameters"]["task.ignore_rcode"] = args.ignore_rcode

    pipeline = {
        "name": "arv-run " + " | ".join([s[0] for s in slots[2:]]),
        "description": "@" + " ".join(starting_args) + "@",
        "components": {
            "command": component
        },
        "state": "RunningOnClient" if args.local else "RunningOnServer"
    }

    if args.dry_run:
        print(json.dumps(pipeline, indent=4))
    else:
        pipeline["owner_uuid"] = project
        pi = api.pipeline_instances().create(body=pipeline, ensure_unique_name=True).execute()
        print "Running pipeline %s" % pi["uuid"]

        if args.local:
            subprocess.call(["arv-run-pipeline-instance", "--instance", pi["uuid"], "--run-jobs-here"] + (["--no-reuse"] if args.no_reuse else []))
        elif not args.no_wait:
            ws.main(["--pipeline", pi["uuid"]])

        pi = api.pipeline_instances().get(uuid=pi["uuid"]).execute()
        print "Pipeline is %s" % pi["state"]
        if "output_uuid" in pi["components"]["command"]:
            print "Output is %s" % pi["components"]["command"]["output_uuid"]
        else:
            print "No output"
Example #13
def main():
    signal(SIGINT, sigint_handler)
    signal(SIGTERM, sigterm_handler)

    this_job = arvados.current_job()

    skip_sq_sn_regex = this_job['script_parameters']['skip_sq_sn_regex']

    genome_chunks = int(this_job['script_parameters']['genome_chunks'])
    if genome_chunks < 1:
        raise InvalidArgumentError("genome_chunks must be a positive integer")

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0,
                           and_end_task=True,
                           skip_sq_sn_regex=skip_sq_sn_regex,
                           genome_chunks=genome_chunks)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError(
            "No reference fasta found in reference collection.")

    # Ensure we can read the reference fasta
    test_and_prime_input_file(
        ref_file,
        error_exception=FileAccessError("reference fasta not readable: %s" %
                                        ref_file))

    # Ensure we have corresponding .fai, and that it is also readable
    ref_fai_file = ref_file + ".fai"
    test_and_prime_input_file(
        ref_fai_file,
        error_exception=FileAccessError(
            "reference fai index not readable: %s" % ref_fai_file))

    # Get genome chunk intervals file
    chunk_file = None
    print "Mounting chunk collection"
    chunk_dir = arvados.get_task_param_mount('chunk')

    for f in arvados.util.listdir_recursive(chunk_dir):
        if re.search(r'\.region_list$', f):
            chunk_file = os.path.join(chunk_dir, f)
    if chunk_file is None:
        raise InvalidArgumentError(
            "No chunk intervals file found in chunk collection.")
    # Ensure we can read the chunk file
    test_and_prime_input_file(
        chunk_file,
        error_exception=FileAccessError(
            "Chunk intervals file not readable: %s" % chunk_file))

    # Get single CRAM file for this task
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    stream_name = ""
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    test_and_prime_input_file(cram_file,
                              error_exception=FileAccessError(
                                  "CRAM file not readable: %s" % cram_file))

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert (cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not test_and_prime_input_file(crai_file, error_exception=None):
        crai_file = cram_file_base + ".cram.crai"
        if not test_and_prime_input_file(crai_file, error_exception=None):
            raise FileAccessError(
                "No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    tmp_dir = arvados.current_task().tmpdir
    out_dir = os.path.join(tmp_dir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir,
                                                            old_out_dir)
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
    output_basename = os.path.basename(
        cram_file_base) + "." + os.path.basename(chunk_file)
    out_file_tmp = os.path.join(tmp_dir, output_basename + ".g.vcf.tmp")
    penultimate_out_file = os.path.join(
        tmp_dir, output_basename + ".provheader.g.vcf.gz")
    final_out_file = os.path.join(out_dir, output_basename + ".g.vcf.gz")

    #    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
    regions = []
    print "Preparing region list from chunk file [%s]" % chunk_file
    with open(chunk_file, 'r') as f:
        for line in f.readlines():
            (chr, start, end) = line.rstrip().split()
            region = "%s:%s-%s" % (chr, start, end)
            regions.append(region)
    total_region_count = len(regions)

    print "Preparing fifos for output from %s bcftools mpileup commands (one for each region) to bcftools concat" % total_region_count

    concat_noheader_fifos = dict()
    concat_headeronly_tmps = dict()
    current_region_num = 0
    for region in regions:
        current_region_num += 1
        concat_noheader_fifo = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.g.vcf" %
                               (current_region_num, total_region_count)))
        try:
            os.mkfifo(concat_noheader_fifo, 0600)
        except:
            print "ERROR: could not mkfifo %s" % concat_noheader_fifo
            raise
        fifos_to_delete.append(concat_noheader_fifo)
        concat_noheader_fifos[region] = concat_noheader_fifo
        concat_headeronly_tmp = os.path.join(
            tmp_dir,
            output_basename + (".part_%s_of_%s.headeronly.g.vcf.gz" %
                               (current_region_num, total_region_count)))
        concat_headeronly_tmps[region] = concat_headeronly_tmp

    region_concat_cmd = ["cat"]
    region_concat_cmd.extend(
        [concat_noheader_fifos[region] for region in regions])

    # open file for output file
    out_file_tmp_f = open(out_file_tmp, 'wb')

    region_concat_p = run_child_cmd(region_concat_cmd,
                                    stdout=out_file_tmp_f,
                                    tag="bcftools concat (stderr)")

    current_region_num = 0
    current_concat_noheader_fifo_f = None
    regions_to_process = list(regions)
    bcftools_mpileup_p = None
    bcftools_norm_p = None
    part_tee_p = None
    bcftools_view_headeronly_p = None
    bcftools_view_noheader_p = None
    while True:
        # at least one of the regional aggregation processes is still running

        watch_fds_and_print_output()

        if ((bcftools_mpileup_p is None) and (bcftools_norm_p is None)
                and (part_tee_p is None)
                and (bcftools_view_headeronly_p is None)
                and (bcftools_view_noheader_p is None)):
            # no per-region processes are running (they have finished or
            # have not yet started)
            if len(regions_to_process) > 0:
                # have more regions to run
                region = regions_to_process.pop(0)
                current_region_num += 1
                region_label = "%s/%s [%s]" % (current_region_num,
                                               total_region_count, region)
                concat_noheader_fifo = concat_noheader_fifos[region]
                bcftools_view_noheader_input_fifo = os.path.join(
                    tmp_dir, output_basename +
                    (".part_%s_of_%s.noheader.g.bcf" %
                     (current_region_num, total_region_count)))
                part_tee_cmd = [
                    "teepot", bcftools_view_noheader_input_fifo, "-"
                ]
                bcftools_view_noheader_cmd = [
                    "bcftools", "view", "-H", "-Ov",
                    bcftools_view_noheader_input_fifo
                ]
                concat_headeronly_tmp = concat_headeronly_tmps[region]
                bcftools_view_headeronly_cmd = [
                    "bcftools", "view", "-h", "-Oz", "-o",
                    concat_headeronly_tmp
                ]
                bcftools_norm_cmd = ["bcftools", "norm", "-f", ref_file, "-Ou"]
                bcftools_mpileup_cmd = [
                    "bcftools-gvcf", "mpileup", "-t", "AD,INFO/AD", "-C50",
                    "-pm2", "-F0.1", "-d10000", "--gvcf", "1,2,3,4,5,10,15",
                    "-f", ref_file, "-Ou", "-r", region, cram_file
                ]

                print "Creating 'bcftools mpileup | bcftools norm' pipe for region %s" % (
                    region_label)
                (bcftools_norm_stdin_pipe_read,
                 bcftools_norm_stdin_pipe_write) = os.pipe()

                print "Creating 'bcftools norm | tee' pipe for region %s" % (
                    region_label)
                part_tee_stdin_pipe_read, part_tee_stdin_pipe_write = os.pipe()

                print "Creating 'tee | bcftools view -h' pipe for region %s" % (
                    region_label)
                (bcftools_view_headeronly_stdin_pipe_read,
                 bcftools_view_headeronly_stdin_pipe_write) = os.pipe()

                print "Creating 'tee | bcftools view' named pipe [%s] for region %s" % (
                    bcftools_view_noheader_input_fifo, region_label)
                try:
                    os.mkfifo(bcftools_view_noheader_input_fifo, 0600)
                except:
                    print "ERROR: could not mkfifo %s" % bcftools_view_noheader_input_fifo
                    raise
                fifos_to_delete.append(bcftools_view_noheader_input_fifo)

                print "Opening concat fifo %s for writing" % concat_noheader_fifo
                if current_concat_noheader_fifo_f is not None:
                    #print "ERROR: current_concat_noheader_fifo_f was not closed properly"
                    #raise Exception("current_concat_noheader_fifo_f was not closed properly")
                    current_concat_noheader_fifo_f.close()
                current_concat_noheader_fifo_f = open(concat_noheader_fifo,
                                                      'wb')

                bcftools_mpileup_p = run_child_cmd(
                    bcftools_mpileup_cmd,
                    stdout=bcftools_norm_stdin_pipe_write,
                    tag="bcftools mpileup %s" % (region_label))

                bcftools_norm_p = run_child_cmd(
                    bcftools_norm_cmd,
                    stdin=bcftools_norm_stdin_pipe_read,
                    stdout=part_tee_stdin_pipe_write,
                    tag="bcftools norm %s" % (region_label))

                part_tee_p = run_child_cmd(
                    part_tee_cmd,
                    stdin=part_tee_stdin_pipe_read,
                    stdout=bcftools_view_headeronly_stdin_pipe_write,
                    tag="tee %s" % (region_label))

                bcftools_view_headeronly_p = run_child_cmd(
                    bcftools_view_headeronly_cmd,
                    stdin=bcftools_view_headeronly_stdin_pipe_read,
                    tag="bcftools view -h %s" % (region_label))

                bcftools_view_noheader_p = run_child_cmd(
                    bcftools_view_noheader_cmd,
                    stdout=current_concat_noheader_fifo_f,
                    tag="bcftools view %s" % (region_label))

        bcftools_mpileup_p = close_process_if_finished(
            bcftools_mpileup_p,
            "bcftools mpileup %s" % (region_label),
            close_fds=[bcftools_norm_stdin_pipe_write])

        bcftools_norm_p = close_process_if_finished(
            bcftools_norm_p,
            "bcftools norm %s" % (region_label),
            close_fds=[
                bcftools_norm_stdin_pipe_read, part_tee_stdin_pipe_write
            ])

        part_tee_p = close_process_if_finished(
            part_tee_p,
            "tee %s" % (region_label),
            close_fds=[
                part_tee_stdin_pipe_read,
                bcftools_view_headeronly_stdin_pipe_write
            ],
            ignore_error=True)

        bcftools_view_headeronly_p = close_process_if_finished(
            bcftools_view_headeronly_p,
            "bcftools view -h %s" % (region_label),
            close_fds=[bcftools_view_headeronly_stdin_pipe_read])

        bcftools_view_noheader_p = close_process_if_finished(
            bcftools_view_noheader_p,
            "bcftools view %s" % (region_label),
            close_files=[current_concat_noheader_fifo_f])

        region_concat_p = close_process_if_finished(
            region_concat_p, "bcftools concat", close_files=[out_file_tmp_f])

        # end loop once all processes have finished
        if ((region_concat_p is None) and (bcftools_view_noheader_p is None)
                and (bcftools_view_headeronly_p is None)
                and (part_tee_p is None) and (bcftools_norm_p is None)
                and (bcftools_mpileup_p is None)):
            print "All region work has completed"
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if len(child_pids) > 0:
        print "WARNING: some children are still alive: [%s]" % (child_pids)
        for pid in child_pids:
            print "Attempting to terminate %s forcefully" % (pid)
            try:
                os.kill(pid, SIGTERM)
            except Exception as e:
                print "Could not kill pid %s: %s" % (pid, e)

    for fifo in fifos_to_delete:
        try:
            os.remove(fifo)
        except:
            raise

    concat_headeronly_tmp_fofn = os.path.join(tmp_dir,
                                              output_basename + ".fifos_fofn")
    tmp_files_to_delete = []
    print "Preparing fofn for bcftools concat (headeronly): %s" % (
        concat_headeronly_tmp_fofn)
    with open(concat_headeronly_tmp_fofn, 'w') as f:
        print "Checking files for regions: %s" % regions
        for concat_headeronly_tmp in [
                concat_headeronly_tmps[region] for region in regions
        ]:
            if os.path.exists(concat_headeronly_tmp):
                print "Adding %s to fofn" % concat_headeronly_tmp
                f.write("%s\n" % concat_headeronly_tmp)
                tmp_files_to_delete.append(concat_headeronly_tmp)
            else:
                print "WARNING: no output file for %s (there was probably no data in the region)" % concat_headeronly_tmp

    final_headeronly_tmp = os.path.join(tmp_dir,
                                        output_basename + ".headeronly.g.vcf")
    final_headeronly_tmp_f = open(final_headeronly_tmp, 'wb')

    print "Creating 'bcftools concat | grep' pipe"
    grep_headeronly_stdin_pipe_read, grep_headeronly_stdin_pipe_write = os.pipe()

    grep_headeronly_cmd = [
        "egrep", "-v", "^[#][#](bcftools|mpileup|reference)"
    ]
    grep_headeronly_p = run_child_cmd(grep_headeronly_cmd,
                                      stdin=grep_headeronly_stdin_pipe_read,
                                      stdout=final_headeronly_tmp_f,
                                      tag="grep (headeronly)")
    bcftools_concat_headeronly_cmd = [
        "bcftools", "concat", "-Ov", "-f", concat_headeronly_tmp_fofn
    ]
    bcftools_concat_headeronly_p = run_child_cmd(
        bcftools_concat_headeronly_cmd,
        stdout=grep_headeronly_stdin_pipe_write,
        tag="bcftools concat (headeronly)")
    while True:
        watch_fds_and_print_output()
        bcftools_concat_headeronly_p = close_process_if_finished(
            bcftools_concat_headeronly_p,
            "bcftools concat (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_write])
        grep_headeronly_p = close_process_if_finished(
            grep_headeronly_p,
            "grep (headeronly)",
            close_fds=[grep_headeronly_stdin_pipe_read],
            close_files=[final_headeronly_tmp_f])
        if ((bcftools_concat_headeronly_p is None)
                and (grep_headeronly_p is None)):
            # none of the processes are still running, we're done!
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_concat_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate bcftools concat (headeronly)"

    if grep_headeronly_p is not None:
        print "ERROR: failed to cleanly terminate grep (headeronly)"

    # check if there was any data output
    if os.stat(out_file_tmp)[6] == 0:
        # 0-byte data file, there is no point in concatenating and
        # reheader will reject the file, so we need to bgzip it ourselves
        print "Handling 0-byte data file - compressing headeronly vcf with bgzip to create [%s]" % (
            final_out_file)
        final_out_file_f = open(final_out_file, 'wb')
        final_bgzip_cmd = ["bgzip", "-c", final_headeronly_tmp]
        final_bgzip_p = run_child_cmd(final_bgzip_cmd,
                                      tag="final bgzip",
                                      stdout=final_out_file_f)
        while True:
            watch_fds_and_print_output()
            final_bgzip_p = close_process_if_finished(
                final_bgzip_p, "final bgzip", close_files=[final_out_file_f])
            if (final_bgzip_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bgzip_p is not None:
            print "ERROR: failed to cleanly terminate final bgzip (header with no data)"
    else:
        # there is some data in the data file
        print "Creating final 'cat | bcftools view -Oz' pipe"
        (final_bcftools_view_stdin_pipe_read,
         final_bcftools_view_stdin_pipe_write) = os.pipe()
        print "Preparing penultimate output file [%s]" % (penultimate_out_file)
        final_bcftools_view_cmd = [
            "bcftools", "view", "-Oz", "-o", penultimate_out_file
        ]
        final_concat_cmd = ["cat", final_headeronly_tmp, out_file_tmp]
        final_bcftools_view_p = run_child_cmd(
            final_bcftools_view_cmd,
            tag="final bcftools view -Oz",
            stdin=final_bcftools_view_stdin_pipe_read)
        final_concat_p = run_child_cmd(
            final_concat_cmd,
            tag="final cat (header+data)",
            stdout=final_bcftools_view_stdin_pipe_write)
        while True:
            watch_fds_and_print_output()
            final_bcftools_view_p = close_process_if_finished(
                final_bcftools_view_p,
                "final bcftools view -Oz",
                close_fds=[final_bcftools_view_stdin_pipe_read])
            final_concat_p = close_process_if_finished(
                final_concat_p,
                "final cat (header+data)",
                close_fds=[final_bcftools_view_stdin_pipe_write])
            if ((final_concat_p is None) and (final_bcftools_view_p is None)):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration

        if final_bcftools_view_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools view -Oz"

        if final_concat_p is not None:
            print "ERROR: failed to cleanly terminate final cat (header+data)"

        print "Reheadering penultimate output file into final out file [%s]" % (
            final_out_file)
        final_bcftools_reheader_cmd = [
            "bcftools", "reheader", "-h", final_headeronly_tmp, "-o",
            final_out_file, penultimate_out_file
        ]
        final_bcftools_reheader_p = run_child_cmd(
            final_bcftools_reheader_cmd, tag="final bcftools reheader")
        while True:
            watch_fds_and_print_output()
            final_bcftools_reheader_p = close_process_if_finished(
                final_bcftools_reheader_p, "final bcftools reheader")
            if (final_bcftools_reheader_p is None):
                # none of the processes are still running, we're done!
                break
            else:
                sleep(0.01)
                # continue to next loop iteration
        if final_bcftools_reheader_p is not None:
            print "ERROR: failed to cleanly terminate final bcftools reheader"
        os.remove(penultimate_out_file)

    print "Indexing final output file [%s]" % (final_out_file)
    bcftools_index_cmd = ["bcftools", "index", final_out_file]
    bcftools_index_p = run_child_cmd(bcftools_index_cmd, tag="bcftools index")
    while True:
        watch_fds_and_print_output()
        bcftools_index_p = close_process_if_finished(bcftools_index_p,
                                                     "bcftools index")
        if (bcftools_index_p is None):
            break
        else:
            sleep(0.01)
            # continue to next loop iteration

    if bcftools_index_p is not None:
        print "ERROR: failed to cleanly terminate bcftools index"

    print "Complete, removing temporary files"
    os.remove(concat_headeronly_tmp_fofn)
    os.remove(out_file_tmp)
    os.remove(final_headeronly_tmp)
    for tmp_file in tmp_files_to_delete:
        os.remove(tmp_file)

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    print "Writing Keep Collection from [%s] to [%s]" % (out_dir, stream_name)
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()
    print "Task output locator [%s]" % output_locator

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)

    # Done!
    print "Task complete!"
Example #14
    of.write("directory structure:\n" + lsinfo)

    dfinfo = sp.check_output(["df", "-h"])
    of.write("df:\n" + dfinfo)

    meminfo = sp.check_output(["free", "-hm"])
    of.write("mem:\n" + meminfo)

    hostinfo = sp.check_output(["hostname"])
    of.write("host: " + hostinfo)


job = arv.current_job()
task = arv.current_task()

of = arv.CollectionWriter()
of.set_current_file_name("info.log")

whoinfo = sp.check_output(["whoami"])
of.write("user: "******"\n")

pwdinfo = sp.check_output(["pwd"])
of.write("pwd: " + pwdinfo + "\n")

lsinfo = sp.check_output(["ls", "-lahR"])
of.write("directory structure:\n" + lsinfo)

dfinfo = sp.check_output(["df", "-h"])
of.write("df:\n" + dfinfo + "\n")

meminfo = sp.check_output(["free", "-hm"])
Example #15
                    help="List of keys to use for each TSV file",
                    action="append")
args = parser.parse_args()

ds, ds_path = find_subjects_neuro_data_lite.install_dataset(
    args.dataset_name, args.output_dir)
_ign, complete_subjects, _ign, _ign = find_subjects_behavior_data.get_data(
    args.behavior_files, args.behavior_keys)
complete_subjects, datatypes = find_subjects_neuro_data_lite.get_type_neuro_data(
    args.output_dir, subjects=complete_subjects, add_unknown_subjects=False)

if args.get_data:
    subject_pdh_uuid = []
    subjects = sorted([s for s in complete_subjects])
    for subject in subjects:
        out = arvados.CollectionWriter()
        subject_get_data_path = os.path.join(ds_path, subject)
        find_subjects_neuro_data_lite.get_dataset_data(
            ds, subject_get_data_path, parallelized=args.ncores)
        print("Writing %s to new collection" % (subject_get_data_path))
        out.write_directory_tree(subject_get_data_path)
        out.finish()
        print("Written Collection: \n\tPDH: %s\n\n" %
              (out.portable_data_hash()))
        collection_body = {
            "collection": {
                "name": "%s MPI-Leipzig data" % (subject),
                "owner_uuid": args.project_uuid,
                "portable_data_hash": out.portable_data_hash(),
                "manifest_text": out.manifest_text()
            }
Example #16
def uploadfiles(files,
                api,
                dry_run=False,
                num_retries=0,
                project=None,
                fnPattern="$(file %s/%s)",
                name=None):
    # Find the smallest path prefix that includes all the files that need to be uploaded.
    # This starts at the root and iteratively removes common parent directory prefixes
    # until all file paths no longer have a common parent.
    n = True
    pathprefix = "/"
    while n:
        pathstep = None
        for c in files:
            if pathstep is None:
                sp = c.fn.split('/')
                if len(sp) < 2:
                    # no parent directories left
                    n = False
                    break
                # path step takes next directory
                pathstep = sp[0] + "/"
            else:
                # check if pathstep is common prefix for all files
                if not c.fn.startswith(pathstep):
                    n = False
                    break
        if n:
            # pathstep is common parent directory for all files, so remove the prefix
            # from each path
            pathprefix += pathstep
            for c in files:
                c.fn = c.fn[len(pathstep):]

    orgdir = os.getcwd()
    os.chdir(pathprefix)

    logger.info("Upload local files: \"%s\"",
                '" "'.join([c.fn for c in files]))

    if dry_run:
        logger.info("$(input) is %s", pathprefix.rstrip('/'))
        pdh = "$(input)"
    else:
        files = sorted(files, key=lambda x: x.fn)
        collection = arvados.CollectionWriter(api, num_retries=num_retries)
        stream = None
        for f in files:
            sp = os.path.split(f.fn)
            if sp[0] != stream:
                stream = sp[0]
                collection.start_new_stream(stream)
            collection.write_file(f.fn, sp[1])

        exists = api.collections().list(filters=[[
            "owner_uuid", "=", project
        ], ["portable_data_hash", "=",
            collection.portable_data_hash()], ["name", "=", name]]).execute(
                num_retries=num_retries)
        if exists["items"]:
            item = exists["items"][0]
            logger.info("Using collection %s", item["uuid"])
        else:
            body = {
                "owner_uuid": project,
                "manifest_text": collection.manifest_text()
            }
            if name is not None:
                body["name"] = name
            item = api.collections().create(body=body,
                                            ensure_unique_name=True).execute()
            logger.info("Uploaded to %s", item["uuid"])

        pdh = item["portable_data_hash"]

    for c in files:
        c.keepref = "%s/%s" % (pdh, c.fn)
        c.fn = fnPattern % (pdh, c.fn)

    os.chdir(orgdir)
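
A hedged usage sketch of uploadfiles(): the LocalFile stand-in and the project UUID are hypothetical, the listed paths must exist locally, and real callers pass ArvFile/UploadFile objects as in Example #12:

import arvados

class LocalFile(object):
    # Minimal stand-in carrying the .fn attribute that uploadfiles() expects.
    def __init__(self, fn):
        self.fn = fn

api = arvados.api('v1')
files = [LocalFile('data/a.txt'), LocalFile('data/b.txt')]
uploadfiles(files, api, num_retries=3,
            project='zzzzz-j7d0g-0123456789abcde',  # hypothetical project UUID
            name='example upload')
for f in files:
    print f.fn  # rewritten to "$(file <pdh>/<path>)" via the default fnPattern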
Example #17
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.one_task_per_group_and_per_n_gvcfs(ref_input_pdh,
                                                   job_input_pdh,
                                                   interval_lists_pdh,
                                                   group_by_regex,
                                                   max_gvcfs_to_combine,
                                                   if_sequence=0,
                                                   and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase II: Read interval_list and split into additional intervals
    ################################################################################
    hgi_arvados.one_task_per_interval(
        interval_count,
        validate_task_output,
        reuse_tasks=True,
        oldest_git_commit_to_reuse="1f6e1e0b8bb12c573dd253d7900ef55305d55aa1",
        if_sequence=1,
        and_end_task=True)

    # We will never reach this point if we are in the 1st task sequence
    assert (this_task['sequence'] > 1)

    ################################################################################
    # Phase IIIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIIb: Combine gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    interval_str = this_task['parameters'].get('interval')
    if not interval_str:
        interval_str = ""
    interval_strs = interval_str.split()
    intervals = []
    for interval in interval_strs:
        intervals.extend(["--intervals", interval])
    out_file = name + ".vcf.gz"
    if interval_count > 1:
        out_file = name + "." + '_'.join(interval_strs) + ".vcf.gz"
        if len(out_file) > 255:
            out_file = name + "." + '_'.join(
                [interval_strs[0], interval_strs[-1]]) + ".vcf.gz"
            print "Output file name was too long with full interval list, shortened it to: %s" % out_file
        if len(out_file) > 255:
            raise errors.InvalidArgumentError(
                "Output file name is too long, cannot continue: %s" % out_file)

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # CombineGVCFs!
    extra_args = intervals
    extra_args.extend(["--breakBandsAtMultiplesOf", "1000000"])
    gatk_exit = gatk.combine_gvcfs(ref_file,
                                   gvcf_files,
                                   os.path.join(out_dir, out_file),
                                   extra_gatk_args=extra_args)

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
Example #18
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    if arvados.current_task()['sequence'] == 0:
        # get candidates for task reuse
        task_key_params = [
            'inputs', 'ref', 'name'
        ]  # N.B. inputs collection includes input vcfs and corresponding interval_list
        script = "gatk-genotypegvcfs.py"
        oldest_git_commit_to_reuse = '6ca726fc265f9e55765bf1fdf71b86285b8a0ff2'
        job_filters = [
            ['script', '=', script],
            ['repository', '=',
             arvados.current_job()['repository']],
            ['script_version', 'in git', oldest_git_commit_to_reuse],
            [
                'docker_image_locator', 'in docker',
                arvados.current_job()['docker_image_locator']
            ],
        ]

        # retrieve a full set of all possible reusable tasks at sequence 1
        print "Retrieving all potentially reusable tasks"
        reusable_tasks = hgi_arvados.get_reusable_tasks(
            1, task_key_params, job_filters)
        print "Have %s tasks for potential reuse" % (len(reusable_tasks))

        def create_task_with_validated_reuse(sequence, params):
            return hgi_arvados.create_or_reuse_task(sequence, params,
                                                    reusable_tasks,
                                                    task_key_params,
                                                    validate_task_output)

        # Setup sub tasks (and terminate if this is task 0)
        hgi_arvados.one_task_per_group_combined_inputs(
            ref_input_pdh,
            job_input_pdh,
            interval_lists_pdh,
            group_by_regex,
            if_sequence=0,
            and_end_task=True,
            create_task_func=create_task_with_validated_reuse)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task sequence
    assert (this_task['sequence'] > 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Genotype gVCFs!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    gvcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="inputs")
    name = this_task['parameters'].get('name')
    if not name:
        name = "unknown"
    out_file = name + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_file = out_file.replace(".bcf", "._cf")

    # GenotypeGVCFs!
    gatk_exit = gatk.genotype_gvcfs(ref_file,
                                    interval_list_file,
                                    gvcf_files,
                                    os.path.join(out_dir, out_file),
                                    cores="4",
                                    java_mem="19g")

    if gatk_exit != 0:
        print "WARNING: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
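
The write/validate/set-output tail above recurs in most of these examples; a minimal sketch of factoring it into a shared helper, assuming the same validate_task_output callable and job task dict used in the scripts above (this helper is hypothetical, not part of the original code):

import arvados

def write_validate_and_set_output(out_dir, this_task, validate_task_output):
    # Write out_dir to a new Keep collection and get its locator.
    out = arvados.CollectionWriter()
    out.write_directory_tree(out_dir)
    output_locator = out.finish()

    if validate_task_output(output_locator):
        # Use the resulting locator as the output for this task.
        this_task.set_output(output_locator)
        return True

    # Validation failed: mark the task as failed instead of setting output.
    arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                     body={'success': False}).execute()
    return False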
Ejemplo n.º 19
0
            CURR_TILEVARS[tile_int].append(len(TILEVARS_TO_WRITE))
        ignore, phaseA, phaseB, HG19_GENOME_VARIANTS, GENOME_VARIANT_ID, var_to_append = retval
        HUMAN_SEQ_PHASEA.extend(phaseA)
        HUMAN_SEQ_PHASEB.extend(phaseB)
        TILEVARS_TO_WRITE.append(var_to_append)
    return True

########################################################################################################################
#Set-up files to write out to

GENOME_VARIANT_FILE = 'genomevariant.csv'
GENOME_TRANSLATION_FILE = 'genomevarianttranslation.csv'
NEW_TILEVARIANT_FILE = 'tilevariant.csv'

# Write a new collection as output
out = arvados.CollectionWriter(num_retries=NUM_RETRIES)

#Parallelize the job according to paths and paths only => use library collection as the main input!
arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True)
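# (one_task_per_input_file queues one new task per file in the job's 'input'
# collection; with and_end_task=True this sequence-0 task records success and
# exits here, so only the per-file tasks at sequence 1 run the code below.)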

# Get object representing the current task
this_task = arvados.current_task()

# Get the input file for the task
input_id, library_input_path = this_task['parameters']['input'].split('/', 1)

#open the input collection (containing library files, tilelocusannotation.csv, tilevariant.csv, and possibly genomevariant.csv)
library_input_collection = arvados.CollectionReader(input_id)

#only do work if we are given a library file as input!
if library_input_path.endswith('_library.csv'):
Ejemplo n.º 20
0
def main():
    ################################################################################
    # Phase I: Check inputs and setup sub tasks 1-N to process group(s) based on
    #          applying the capturing group named "group_by" in group_by_regex.
    #          (and terminate if this is task 0)
    ################################################################################
    ref_input_pdh = gatk_helper.prepare_gatk_reference_collection(
        reference_coll=arvados.current_job()['script_parameters']
        ['reference_collection'])
    job_input_pdh = arvados.current_job(
    )['script_parameters']['inputs_collection']
    interval_lists_pdh = arvados.current_job(
    )['script_parameters']['interval_lists_collection']
    interval_count = 1
    if "interval_count" in arvados.current_job()['script_parameters']:
        interval_count = arvados.current_job(
        )['script_parameters']['interval_count']

    # Setup sub tasks 1-N (and terminate if this is task 0)
    hgi_arvados.chunked_tasks_per_cram_file(
        ref_input_pdh,
        job_input_pdh,
        interval_lists_pdh,
        validate_task_output,
        if_sequence=0,
        and_end_task=True,
        reuse_tasks=False,
        oldest_git_commit_to_reuse='6ca726fc265f9e55765bf1fdf71b86285b8a0ff2',
        script="gatk-haplotypecaller-cram.py")

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert (this_task['sequence'] != 0)

    ################################################################################
    # Phase IIa: If we are a "reuse" task, just set our output and be done with it
    ################################################################################
    if 'reuse_job_task' in this_task['parameters']:
        print "This task's work was already done by JobTask %s" % this_task[
            'parameters']['reuse_job_task']
        exit(0)

    ################################################################################
    # Phase IIb: Call Haplotypes!
    ################################################################################
    ref_file = gatk_helper.mount_gatk_reference(ref_param="ref")
    interval_list_file = gatk_helper.mount_single_gatk_interval_list_input(
        interval_list_param="chunk")
    cram_file = gatk_helper.mount_gatk_cram_input(input_param="input")
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    out_dir = hgi_arvados.prepare_out_dir()
    out_filename = os.path.basename(cram_file_base) + "." + os.path.basename(
        interval_list_file) + ".vcf.gz"

    # because of a GATK bug, name cannot contain the string '.bcf' anywhere within it or we will get BCF output
    out_filename = out_filename.replace(".bcf", "._cf")

    # HaplotypeCaller!
    gatk_exit = gatk.haplotype_caller(ref_file, cram_file, interval_list_file,
                                      os.path.join(out_dir, out_filename))

    if gatk_exit != 0:
        print "ERROR: GATK exited with exit code %s (NOT WRITING OUTPUT)" % gatk_exit
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "GATK exited successfully, writing output to keep"

        # Write a new collection as output
        out = arvados.CollectionWriter()

        # Write out_dir to keep
        out.write_directory_tree(out_dir)

        # Commit the output to Keep.
        output_locator = out.finish()

        print "Task output written to keep, validating it"
        if validate_task_output(output_locator):
            print "Task output validated, setting output to %s" % (
                output_locator)

            # Use the resulting locator as the output for this task.
            this_task.set_output(output_locator)
        else:
            print "ERROR: Failed to validate task output (%s)" % (
                output_locator)
            arvados.api().job_tasks().update(
                uuid=arvados.current_task()['uuid'], body={
                    'success': False
                }).execute()
Ejemplo n.º 21
0
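The expected manifest in the test below illustrates how Keep names blocks: a locator is the MD5 hex digest of the block data followed by "+" and its length in bytes. A quick check of the digest used there, assuming make_test_file() writes the 4 bytes "test":

import hashlib

block = "test"
locator = "%s+%d" % (hashlib.md5(block).hexdigest(), len(block))
# locator == "098f6bcd4621d373cade4e832627b4f6+4", as asserted in the test below
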
 def test_write_named_file(self):
     cwriter = arvados.CollectionWriter(self.api_client)
     with self.make_test_file() as testfile:
         cwriter.write_file(testfile.name, 'foo')
         self.assertEqual(cwriter.manifest_text(),
                          ". 098f6bcd4621d373cade4e832627b4f6+4 0:4:foo\n")
Ejemplo n.º 22
0
def main():

    this_job = arvados.current_job()

    # Setup sub tasks 1-N (and terminate if this is task 0)
    one_task_per_cram_file(if_sequence=0, and_end_task=True)

    # Get object representing the current task
    this_task = arvados.current_task()

    # We will never reach this point if we are in the 0th task
    assert(this_task['sequence'] != 0)

    # Get reference FASTA
    ref_file = None
    print "Mounting reference FASTA collection"
    ref_dir = arvados.get_task_param_mount('ref')

    for f in arvados.util.listdir_recursive(ref_dir):
        if re.search(r'\.fa$', f):
            ref_file = os.path.join(ref_dir, f)
    if ref_file is None:
        raise InvalidArgumentError("No reference fasta found in reference collection.")
    # Ensure we can read the reference file
    if not os.access(ref_file, os.R_OK):
        raise FileAccessError("reference FASTA file not readable: %s" % ref_file)
    # TODO: could check readability of .fai and .dict as well?

    # Get genome chunk intervals file
    # chunk_file = None
    # print "Mounting chunk collection"
    # chunk_dir = arvados.get_task_param_mount('chunk')

    # for f in arvados.util.listdir_recursive(chunk_dir):
    #     if re.search(r'\.region_list.txt$', f):
    #         chunk_file = os.path.join(chunk_dir, f)
    # if chunk_file is None:
    #     raise InvalidArgumentError("No chunk intervals file found in chunk collection.")
    # # Ensure we can read the chunk file
    # if not os.access(chunk_file, os.R_OK):
    #     raise FileAccessError("Chunk intervals file not readable: %s" % chunk_file)

    # Get single CRAM file for this task 
    input_dir = None
    print "Mounting task input collection"
    input_dir = arvados.get_task_param_mount('input')

    input_cram_files = []
    for f in arvados.util.listdir_recursive(input_dir):
        if re.search(r'\.cram$', f):
            stream_name, input_file_name = os.path.split(f)
            input_cram_files += [os.path.join(input_dir, f)]
    if len(input_cram_files) != 1:
        raise InvalidArgumentError("Expected exactly one cram file per task.")

    # There is only one CRAM file
    cram_file = input_cram_files[0]

    # Ensure we can read the CRAM file
    if not os.access(cram_file, os.R_OK):
        raise FileAccessError("CRAM file not readable: %s" % cram_file)

    # Ensure we have corresponding CRAI index and can read it as well
    cram_file_base, cram_file_ext = os.path.splitext(cram_file)
    assert(cram_file_ext == ".cram")
    crai_file = cram_file_base + ".crai"
    if not os.access(crai_file, os.R_OK):
        crai_file = cram_file_base + ".cram.crai"
        if not os.access(crai_file, os.R_OK):
            raise FileAccessError("No readable CRAM index file for CRAM file: %s" % cram_file)

    # Will write to out_dir, make sure it is empty
    out_dir = os.path.join(arvados.current_task().tmpdir, 'out')
    if os.path.exists(out_dir):
        old_out_dir = out_dir + ".old"
        print "Moving out_dir %s out of the way (to %s)" % (out_dir, old_out_dir) 
        try:
            os.rename(out_dir, old_out_dir)
        except:
            raise
    try:
        os.mkdir(out_dir)
        os.chdir(out_dir)
    except:
        raise
#    out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + "." + os.path.basename(chunk_file) + ".g.bcf")
    out_file = os.path.join(out_dir, os.path.basename(cram_file_base) + ".g.bcf")

    bash_cmd_pipe = "samtools view -h -u -@ 1 -T %s %s | bcftools mpileup -t AD,INFO/AD -C50 -pm2 -F0.1 -d10000 --gvcf 1,2,3,4,5,10,15 -f %s -Ou - | bcftools view  -Ou | bcftools norm -f %s -Ob -o %s" % (ref_file, cram_file, ref_file, ref_file, out_file)
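    # Pipeline above: samtools view streams the CRAM (decoded against the reference
    # FASTA) as uncompressed BAM; bcftools mpileup computes genotype likelihoods in
    # gVCF mode; bcftools view passes the uncompressed BCF through; bcftools norm
    # normalizes indels against the same reference and writes compressed BCF to out_file.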

    # Call bcftools
    runner_p = subprocess.Popen(bash_cmd_pipe, 
        stdin=None,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
        shell=True)

    while runner_p.poll() is None:
        line = runner_p.stdout.readline()
        print "BCFTOOLS: %s" % line.rstrip()

    runner_exit = runner_p.wait()
    if runner_exit != 0:
        print "WARNING: runner exited with exit code %s" % runner_exit

    # Write a new collection as output
    out = arvados.CollectionWriter()

    # Write out_dir to keep
    out.write_directory_tree(out_dir, stream_name)

    # Commit the output to Keep.
    output_locator = out.finish()

    # Use the resulting locator as the output for this task.
    this_task.set_output(output_locator)
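
Note that, unlike the GATK examples above, this snippet only warns on a nonzero exit status and still writes output. A minimal sketch of handling failure the way the other examples do, assuming the same this_task and runner_exit variables and placed inside main() before the CollectionWriter block:

    if runner_exit != 0:
        print "ERROR: bcftools pipeline exited with exit code %s (NOT WRITING OUTPUT)" % runner_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={'success': False}).execute()
        return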
Ejemplo n.º 23
0
 def foo_writer(self, **kwargs):
     api_client = self.api_client_mock()
     writer = arvados.CollectionWriter(api_client, **kwargs)
     writer.start_new_file('foo')
     writer.write('foo')
     return writer
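
A hypothetical test built on this helper, assuming Keep writes are mocked as in the surrounding test suite; writing the 3 bytes "foo" to a file named "foo" should then yield the single-block manifest below:

 def test_foo_manifest(self):
     # Assumes a Keep mock is in place; num_retries is passed through to CollectionWriter.
     writer = self.foo_writer(num_retries=10)
     self.assertEqual(writer.manifest_text(),
                      ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo\n")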
Ejemplo n.º 24
0
def main():
    # Get object representing the current task
    this_task = arvados.current_task()

    sort_by_r = re.compile(sort_by_regex)
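    # sort_by_regex is defined elsewhere in this script; it must contain a named
    # capturing group "sort_by" matching an integer in each input filename, e.g.
    # (hypothetically) r'\.(?P<sort_by>\d+)\.vcf\.gz$'.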

    ################################################################################
    # Concatenate VCFs in numerically sorted order of sort_by_regex
    ################################################################################
    vcf_files = gatk_helper.mount_gatk_gvcf_inputs(inputs_param="inputs")
    out_dir = hgi_arvados.prepare_out_dir()
    output_prefix = arvados.current_job()['script_parameters']['output_prefix']
    out_file = output_prefix + ".vcf.gz"

    # Concatenate VCFs
    bcftools_concat_exit = bcftools.concat(
        sorted(vcf_files,
               key=lambda fn: int(re.search(sort_by_r, fn).group('sort_by'))),
        os.path.join(out_dir, out_file))

    if bcftools_concat_exit != 0:
        print "WARNING: bcftools concat exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_concat_exit
        arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                         body={
                                             'success': False
                                         }).execute()
    else:
        print "bcftools concat exited successfully, indexing"

        bcftools_index_exit = bcftools.index(os.path.join(out_dir, out_file))

        if bcftools_index_exit != 0:
            print "WARNING: bcftools index exited with exit code %s (NOT WRITING OUTPUT)" % bcftools_index_exit
            arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                             body={
                                                 'success': False
                                             }).execute()
        else:
            print "bcftools index exited successfully, writing output to keep"

            # Write a new collection as output
            out = arvados.CollectionWriter()

            # Write out_dir to keep
            out.write_directory_tree(out_dir)

            # Commit the output to Keep.
            output_locator = out.finish()

            if validate_task_output(output_locator):
                print "Task output validated, setting output to %s" % (
                    output_locator)

                # Use the resulting locator as the output for this task.
                this_task.set_output(output_locator)
            else:
                print "ERROR: Failed to validate task output (%s)" % (
                    output_locator)
                arvados.api().job_tasks().update(uuid=this_task['uuid'],
                                                 body={
                                                     'success': False
                                                 }).execute()