Example 1
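run_audit tallies, per sample, the visible VCF datasets recorded in a FileDocStore, then reports any sample from the master list with fewer than three result files. (These snippets assume the enclosing module's imports, e.g. FileDocStore, Target, and the nebula packages, are already in scope.)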
def run_audit(docstore, sample_list):
    doc = FileDocStore(file_path=docstore)

    master_list = []
    with open(sample_list) as handle:
        for line in handle:
            master_list.append(line.rstrip())

    results = {}
    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            # pull the sample name from a "sample:<name>" tag
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            if doc.size(entry) > 0:
                results[sample] = results.get(sample, []) + [entry['name']]
    for sample, files in results.items():
        print "%s (%s) %s" % (sample, len(files), "\t".join(files))

    for sample in master_list:
        if sample not in results or len(results[sample]) < 3:
            print "missing (%s)" % (len(results.get(sample, []))), sample
Example 2
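run_errors prints every visible docstore entry whose state is 'error'.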
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)

    for id, entry in doc.filter(visible=True):
        if entry.get('state', 'ok') == 'error':
            print entry
Example 3
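run_ls prints the id and name of every docstore entry, optionally with its size.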
def run_ls(docstore, size=False):
    doc = FileDocStore(file_path=docstore)

    for id, entry in doc.filter():
        if size:
            print id, entry.get('name', id), doc.size(entry)
        else:
            print id, entry.get('name', id)
Example 4
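run_timing prints the runtime_seconds job metric for every docstore entry that records one.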
def run_timing(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if 'job' in entry and 'job_metrics' in entry['job']:
            timing = None
            for met in entry['job']['job_metrics']:
                if met['name'] == 'runtime_seconds':
                    timing = met['raw_value']
            if timing is not None:
                print id, entry["name"], timing
Example 5
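A variant of run_ls that also prints the values of extra metadata fields for each entry.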
def run_ls(docstore, size=False, extra=()):
    doc = FileDocStore(file_path=docstore)

    for id, entry in doc.filter():
        # collect the requested extra fields for this entry
        values = []
        for e in extra:
            values.append(str(entry.get(e, "")))
        if size:
            print id, entry.get('name', id), doc.size(entry), " ".join(values)
        else:
            print id, entry.get('name', id), " ".join(values)
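The extra fields are looked up by key on each entry, so an invocation sketch (placeholder path and field names) looks like:

# hypothetical invocation: append each entry's state and uuid to the listing
run_ls("./test_tmp/docstore", size=True, extra=("state", "uuid"))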
Example 6
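A more detailed run_errors that prints each failed dataset's tags, tool id, stdout, and stderr.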
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)

    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            print "Dataset", id, entry.get("tags", "")
            if 'provenance' in entry:
                print "tool:", entry['provenance']['tool_id']
                print "-=-=-=-=-=-=-"
            print entry['job']['stdout']
            print "-------------"
            print entry['job']['stderr']
            print "-=-=-=-=-=-=-"
Example 7
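An integration test that syncs two input files into a docstore, builds a GalaxyWorkflowTask, writes the task and service configuration to disk, executes them through the nebula command-line tool, and dumps the resulting docstore entries.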
    def testNebulaLaunch(self):
        input = {
            "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
            "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
        }
        parameters = {"tail_select": {"lineNum": 3}}

        doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
        logging.info("Adding files to object store")
        sync_doc_dir("examples/simple_galaxy/",
                     doc,
                     uuid_set=[
                         "c39ded10-6073-11e4-9803-0800200c9a66",
                         "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                     ])
        logging.info("Creating Task")
        workflow = GalaxyWorkflow(
            ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
        task = nebula.tasks.GalaxyWorkflowTask("test_workflow",
                                               workflow,
                                               inputs=input,
                                               parameters=parameters)

        service = GalaxyService(docstore=doc,
                                name="nosetest_galaxy",
                                galaxy="bgruening/galaxy-stable:dev",
                                port=20022)

        task_path = get_abspath("../test_tmp/test.tasks")
        service_path = get_abspath("../test_tmp/test.service")
        taskset = TaskGroup()
        taskset.append(task)
        with open(task_path, "w") as handle:
            taskset.store(handle)

        with open(service_path, "w") as handle:
            service.get_config().set_docstore_config(
                cache_path=get_abspath("../test_tmp/cache")).store(handle)

        env = dict(os.environ)
        if 'PYTHONPATH' in env:
            env['PYTHONPATH'] += ":" + get_abspath("../")
        else:
            env['PYTHONPATH'] = get_abspath("../")
        subprocess.check_call(
            [get_abspath("../bin/nebula"), "run", service_path, task_path],
            env=env)

        for i in doc.filter():
            print json.dumps(i, indent=4)
Example 8
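A unit test that a GalaxyService serializes to a dict carrying the expected service_type.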
    def testServiceDescription(self):
        store = FileDocStore("./test_tmp/docstore")
        service = nebula.service.GalaxyService(store)
        service_dict = service.to_dict()
        self.assertIn('service_type', service_dict)
        self.assertEqual('Galaxy', service_dict['service_type'])
        print service_dict
Example 9
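An integration test that deploys an MD5Service through the Mesos runner, submits one task, and waits for it.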
    def testMesosLaunch(self):
        input_file_1 = Target("c39ded10-6073-11e4-9803-0800200c9a66")
        input_file_2 = Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")

        doc = FileDocStore(file_path="./test_tmp/docstore")
        logging.info("Adding files to object store")
        sync_doc_dir("examples/simple_galaxy/",
                     doc,
                     uuid_set=[
                         "c39ded10-6073-11e4-9803-0800200c9a66",
                         "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                     ])

        task_1 = MD5Task(input_file_1)

        md5_service = nebula.service.md5_service.MD5Service(doc)

        sched = nebula.scheduler.Scheduler({})
        mesos = nebula.drms.mesos_runner.MesosDRMS(
            sched, {"mesos": "%s:%s" % (self.host_ip, CONFIG_PARENT_PORT)})
        mesos.start()
        mesos_md5_service = mesos.deploy_service(md5_service)
        job_1 = mesos_md5_service.submit(task_1)
        mesos_md5_service.wait([job_1])
        print job_1
        logging.info("Sleeping for 15")
        time.sleep(15)
        mesos.stop()
Example 10
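A test that a GalaxyService configuration can be serialized to JSON.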
    def testServiceGenerate(self):
        doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
        service = GalaxyService(docstore=doc,
                                name="nosetest_galaxy",
                                galaxy="bgruening/galaxy-stable",
                                port=20022)
        # only checks that the service config serializes without raising
        json.dumps(service.to_dict())
Example 11
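A test that a GalaxyService starts up without entering an error state.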
    def testServiceStart(self):
        store = FileDocStore("./test_tmp/docstore")
        self.service = nebula.service.GalaxyService(store,
                                                    name="nosetest_galaxy",
                                                    force=True,
                                                    port=20022)
        self.service.start()
        time.sleep(10)
        self.assertFalse(self.service.in_error())
Example 12
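run_query filters docstore entries by key=value pairs and prints the selected fields of each match as JSON, optionally prefixed by its size.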
def run_query(docstore, fields, size, filters):
    doc = FileDocStore(file_path=docstore)

    # parse "key=value" strings into keyword arguments for doc.filter
    query = {}
    for k in filters:
        tmp = k.split("=", 1)
        query[tmp[0]] = tmp[1]

    for id, entry in doc.filter(**query):

        if fields is None or len(fields) == 0:
            line = entry
        else:
            line = dict( (i, entry.get(i, "")) for i in fields )

        if size:
            size_value = doc.size(Target(uuid=entry['uuid']))
        else:
            size_value = ""

        print size_value, json.dumps(line)
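Each filter string is split once on "=", so an invocation sketch with placeholder values looks like:

# hypothetical invocation: print name and state for entries whose state is "ok"
run_query("./test_tmp/docstore", fields=["name", "state"], size=False, filters=["state=ok"])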
Example 13
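run_synapse uploads per-sample VCFs from the docstore to Synapse: each file is copied to a work directory, bgzipped, and stored under the given parent entity unless it already exists there.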
def run_synapse(docstore, parent, workdir):
    doc = FileDocStore(file_path=docstore)

    syn = synapseclient.Synapse()
    syn.login()

    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            name = entry['name']
            name = re.sub(r'\.vcf$', '', name)
            file_name = sample + "." + name + ".snv_mnv.vcf"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(workdir, file_name)
                query = "select * from entity where parentId=='%s' and name=='%s'" % (parent, file_name + ".gz")
                r = syn.query(query)['results']
                if len(r) == 0:
                    print dst_file
                    shutil.copy(src_file, dst_file)
                    subprocess.check_call("bgzip %s" % (dst_file), shell=True)
                    f = synapseclient.File(dst_file + ".gz", parentId=parent, name=file_name + ".gz")
                    f.fileType = 'vcf'
                    f.pipeline = 'UCSC'
                    f.variant_type = "snv"
                    f = syn.store(f,
                        executed="https://github.com/ucsccancer/pcawg_tools"
                    )
                else:
                    print "Skipping", file_name
Example 14
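A test that per-tool output tags declared on a GalaxyWorkflowTask survive a full GalaxyService run.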
    def testToolTagging(self):

        doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
        sync_doc_dir(get_abspath("../examples/simple_galaxy/"),
                     doc,
                     uuid_set=[
                         "c39ded10-6073-11e4-9803-0800200c9a66",
                         "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                     ])

        input_file_1 = Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
        input_file_2 = Target(uuid="26fd12a2-9096-4af2-a989-9e2f1cb692fe")
        workflow = GalaxyWorkflow(
            ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
        task_tag = nebula.tasks.GalaxyWorkflowTask(
            "workflow_ok",
            workflow,
            inputs={
                'input_file_1': input_file_1,
                'input_file_2': input_file_2
            },
            parameters={"tail_select": {
                "lineNum": 3
            }},
            tags=["run:testing"],
            tool_tags={
                "tail_select": {
                    "out_file1": ["file:tail"]
                },
                "concat_out": {
                    "out_file1": ["file:output"]
                }
            })
        print "Starting Service"
        service = GalaxyService(docstore=doc,
                                name="nosetest_galaxy",
                                galaxy="bgruening/galaxy-stable:dev",
                                force=True,
                                port=20022)
        service.start()
        self.service = service
        job = service.submit(task_tag)
        print "JOB", job.get_status()
        service.wait([job])
        self.assertIn(job.get_status(), ['ok'])
        self.assertFalse(service.in_error())
        print service.in_error()
Example 15
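run_workflow loads input data into the docstore, builds one Galaxy workflow task per input metadata file, runs them all through a service, polls until every job finishes, and copies visible outputs back into the docstore.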
def run_workflow(args):
    data_map = {}
    for meta_path in glob(os.path.join(args['lib_data'], "*.json")):
        data_path = re.sub(r'\.json$', "", meta_path)
        if os.path.exists(data_path):
            try:
                with open(meta_path) as handle:
                    meta = json.loads(handle.read())
                    if 'uuid' in meta:
                        data_map[meta['uuid']] = data_path
            except (IOError, ValueError):
                # skip metadata files that can't be read or parsed
                pass

    d_url = urlparse(args['doc_store'])
    if d_url.scheme == '':
        doc = FileDocStore(file_path=d_url.path)
    else:
        raise Exception("Object Store type not supported: %s" % (o_url.scheme))

    #this side happens on the master node
    tasks = {}
    task_request = {}
    input_uuids = {}
    for i, input_file in enumerate(args['inputs']):
        with open(input_file) as handle:
            meta = json.loads(handle.read())
        inputs = {}
        for k, v in meta.get('ds_map').items():
            input_uuids[v['uuid']] = True
            t = Target(v['uuid'])
            if not doc.exists(t):
                if t.uuid not in data_map:
                    raise Exception("Can't find input data: %s" % (t.uuid))
                doc.update_from_file(t, data_map[t.uuid], create=True)
                doc.put(t.uuid, t.to_dict())
            inputs[k] = t
        params = meta.get("parameters", {})
        task_name = 'task_%s' % (i)
        if args['workflow'] is not None:
            task = GalaxyWorkflow(task_name,
                                  args['workflow'],
                                  inputs=inputs,
                                  parameters=params,
                                  tags=meta.get("tags", None),
                                  galaxy=args['galaxy'],
                                  tool_dir=args['tool_dir'],
                                  tool_data=args['tool_data'])
        else:
            with open(args['yaml_workflow']) as handle:
                yaml_text = handle.read()
            task = GalaxyWorkflow(task_name,
                                  yaml=yaml_text,
                                  inputs=inputs,
                                  parameters=params,
                                  tags=meta.get("tags", None),
                                  docker=args['galaxy'],
                                  tool_dir=args['tools'],
                                  tool_data=args['tool_data'])
        task_request[task_name] = meta
        task_data = task.get_task_data()
        tasks[task_name] = task_data

    #this side happens on the worker node
    service = ServiceFactory('galaxy',
                             objectstore=doc,
                             lib_data=[doc.file_path],
                             tool_dir=args['tool_dir'],
                             tool_data=args['tool_data'],
                             galaxy=args['galaxy'],
                             config_dir=args['config_dir'],
                             sudo=args['sudo'],
                             force=True,
                             tool_docker=True,
                             smp=args['smp'],
                             cpus=args['cpus'],
                             work_dir=args['work_dir'])
    service.start()
    task_job_ids = {}
    for task_name, task_data in tasks.items():
        task = TaskJob(task_data)
        i = service.submit(task)
        task_job_ids[task_name] = i

    sleep_time = 1
    while True:
        waiting = False
        for i in task_job_ids.values():
            status = service.status(i)
            logging.info("Status check %s %s" % (status, i))
            if status not in ['ok', 'error']:
                waiting = True
        if not waiting:
            break
        time.sleep(sleep_time)
        if sleep_time < 60:
            sleep_time += 1

    #move the output data into the datastore
    for task_name, i in task_job_ids.items():
        job = service.get_job(i)
        if job.error is None:
            for a in job.get_outputs():
                meta = service.get_meta(a)
                doc.put(meta['uuid'], meta)
                if meta.get('visible', True):
                    if meta['state'] == "ok":
                        if meta['uuid'] not in input_uuids:
                            logging.info("Downloading: %s" % (meta['uuid']))
                            service.store_data(a, doc)
                        else:
                            logging.info("Skipping input file %s" % (a))
                    else:
                        logging.info("Skipping non-ok file: %s" %
                                     (meta['state']))
                else:
                    logging.info("Skipping Download %s (not visible)" % (a))

    logging.info("Done")
    if not args['hold']:
        service.stop()
Example 16
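run_get prints the on-disk path of a docstore entry, looked up by UUID.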
def run_get(docstore, uuid, outpath):
    doc = FileDocStore(file_path=docstore)
    # note: outpath is accepted but unused; the entry's path inside the docstore is printed
    print doc.get_filename(Target(uuid=uuid))
Example 17
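An end-to-end test that runs a workflow against a live GalaxyService, round-trips the task definition through JSON first, and checks that a copy with missing parameters fails with an error status.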
    def testRunSimple(self):
        input = {
            "input_file_1" :
                Target("c39ded10-6073-11e4-9803-0800200c9a66"),
            "input_file_2" :
                Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
        }
        parameters = {
            "tail_select" : {
                "lineNum" : 3
            }
        }
        bad_parameters = dict(parameters)
        del bad_parameters['tail_select']

        doc = FileDocStore(file_path="./test_tmp/docstore")
        logging.info("Adding files to object store")
        sync_doc_dir("examples/simple_galaxy/", doc,
            uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66", "26fd12a2-9096-4af2-a989-9e2f1cb692fe"]
        )
        logging.info("Creating Task")
        workflow = GalaxyWorkflow(ga_file="examples/simple_galaxy/SimpleWorkflow.ga")
        task = nebula.tasks.GalaxyWorkflowTask(
            "test_workflow", workflow,
            inputs=input,
            parameters=parameters
        )

        task_data = task.to_dict()
        #make sure the task data can be serialized
        task_data_str = json.dumps(task_data)

        service = GalaxyService(
            docstore=doc,
            name="nosetest_galaxy",
            galaxy="bgruening/galaxy-stable",
            force=True,
            port=20022
        )
        self.service = service

        #make sure the generated task is serializable
        new_task_data = json.loads(task_data_str)
        new_task = nebula.tasks.from_dict(new_task_data)

        logging.info("Starting Service")
        print "Starting service"
        service.start()
        self.assertFalse( service.in_error() )
        logging.info("Starting Tasks")
        job = service.submit(new_task)
        self.assertTrue( isinstance(job, TaskJob) )
        self.assertFalse( service.in_error() )
        #logging.info("Waiting")
        service.wait([job])
        self.assertIn(job.get_status(), ['ok'])

        bad_task = nebula.tasks.GalaxyWorkflowTask(
            "test_workflow_bad",
            workflow,
            inputs=input,
            parameters=bad_parameters
        )
        job = service.submit(bad_task)
        service.wait([job])
        self.assertIn(job.get_status(), ['error'])

        self.assertFalse( service.in_error() )
Example 18
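run_copy mirrors entries and their payload files from one docstore into another, reporting a size mismatch for entries that already exist in the destination.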
def run_copy(docstore, out_docstore):
    doc = FileDocStore(file_path=docstore)

    out_doc = FileDocStore(file_path=out_docstore)

    for id, entry in doc.filter():
        if out_doc.get(id) is None:
            print "copy", id
            out_doc.put(id, entry)
            if doc.exists(entry):
                src_path = doc.get_filename(entry)
                out_doc.create(entry)
                dst_path = out_doc.get_filename(entry)
                shutil.copy(src_path, dst_path)
                out_doc.update_from_file(entry)
        else:
            #print "skip", id, doc.size(entry), out_doc.size(entry)
            if doc.size(entry) != out_doc.size(entry):
                print "mismatch", id
Example 19
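run_scan exports VCFs from the docstore under dated names derived from sample, method, variant type, and call type; bgzips and tabix-indexes each file; records md5sums; and writes one gnos_upload_vcf.pl upload script per pipeline and sample.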
def run_scan(docstore, workdir, keyfile, upload_url, manifest):
    doc = FileDocStore(file_path=docstore)

    file_map = {
        'broad' : {},
        'muse' : {}
    }

    wl_map = {}
    with open(manifest) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            wl_map[row['Donor_ID']] = row

    for id, entry in doc.filter(visible=True):
        if entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
            if 'tags' in entry:
                sample = None
                for s in entry['tags']:
                    tmp = s.split(":")
                    if tmp[0] == 'sample':
                        sample = tmp[1]

                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'] in ['MUSE_1.0rc', 'MUSE_0.9.9.5']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect' ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (sample, method, datestr, variant_type, call_type )

                name = re.sub(r'\.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                print file_name
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(workdir, file_name)
                    shutil.copy(src_file, dst_file)

                    if entry['extension'] == 'vcf':
                        subprocess.check_call( "bgzip %s" % dst_file, shell=True )
                        dst_file = dst_file + ".gz"

                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)

                    if sample not in file_map[pipeline]:
                        file_map[pipeline][sample] = []

                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][sample].append(input_file)

    for pipeline, samples in file_map.items():
        for sample, files in samples.items():
            with open(os.path.join(workdir, "%s.%s.sh" % (pipeline, sample)), "w") as handle:
                urls = [
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Normal_GNOS_endpoint'], wl_map[sample]['Normal_Analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Tumour_GNOS_endpoint'], wl_map[sample]['Tumour_Analysis_ID'])
                ]
                cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                cmd_str += " --metadata-urls %s" % (",".join(urls))
                cmd_str += " --vcfs %s " % (",".join(files))
                cmd_str += " --vcf-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in files) )))
                cmd_str += " --vcf-idxs %s" % ((",".join( ("%s.idx" % i for i in files) )))
                cmd_str += " --vcf-idx-md5sum-files %s" % ((",".join( ("%s.idx.md5" % i for i in files) )))
                cmd_str += " --outdir %s.%s.dir" % (pipeline, sample)
                cmd_str += " --key %s " % (keyfile)
                cmd_str += " --upload-url %s" % (upload_url)
                cmd_str += " --study-refname-override tcga_pancancer_vcf_test"

                handle.write("""#!/bin/bash

%s

""" % (cmd_str) )