def testWorkflow(self):
    input_file_1 = Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target(uuid="26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask(
        "workflow_test",
        workflow,
        inputs={
            'input_file_1': input_file_1,
            'input_file_2': input_file_2
        },
        parameters={"tail_select": {"lineNum": 3}})

    task_data = task.to_dict()
    task_data_str = json.dumps(task_data)
    new_task_data = json.loads(task_data_str)
    new_task = nebula.tasks.from_dict(new_task_data)

    self.assertEqual(len(task.get_inputs()), len(new_task.get_inputs()))
    task_inputs = task.get_inputs()
    new_task_inputs = new_task.get_inputs()
    for k, v in task_inputs.items():
        self.assertIn(k, new_task_inputs)
        self.assertEqual(v, new_task_inputs[k])
def testWorkflowCheck(self):
    input_file_1 = Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target(uuid="26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task_ok = nebula.tasks.GalaxyWorkflowTask(
        "workflow_ok",
        workflow,
        inputs={
            'input_file_1': input_file_1,
            'input_file_2': input_file_2
        },
        parameters={"tail_select": {"lineNum": 3}})
    task_missing = nebula.tasks.GalaxyWorkflowTask(
        "workflow_broken",
        workflow,
        inputs={
            'input_file_1': input_file_1,
            "tail_select": {"lineNum": 3}
        })
    self.assertTrue(task_ok.is_valid())
    self.assertFalse(task_missing.is_valid())
def testMesosLaunch(self):
    input_file_1 = Target("c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    doc = FileDocStore(file_path="./test_tmp/docstore")
    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=[
                     "c39ded10-6073-11e4-9803-0800200c9a66",
                     "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                 ])
    task_1 = MD5Task(input_file_1)

    md5_service = nebula.service.md5_service.MD5Service(doc)

    sched = nebula.scheduler.Scheduler({})
    mesos = nebula.drms.mesos_runner.MesosDRMS(
        sched, {"mesos": "%s:%s" % (self.host_ip, CONFIG_PARENT_PORT)})
    mesos.start()

    mesos_md5_service = mesos.deploy_service(md5_service)
    job_1 = mesos_md5_service.submit(task_1)
    mesos_md5_service.wait([job_1])
    print job_1

    logging.info("Sleeping for 15")
    time.sleep(15)
    mesos.stop()
def testNebulaLaunch(self):
    input = {
        "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {"tail_select": {"lineNum": 3}}

    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=[
                     "c39ded10-6073-11e4-9803-0800200c9a66",
                     "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                 ])
    logging.info("Creating Task")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask("test_workflow",
                                           workflow,
                                           inputs=input,
                                           parameters=parameters)

    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable:dev",
                            port=20022)

    task_path = get_abspath("../test_tmp/test.tasks")
    service_path = get_abspath("../test_tmp/test.service")
    taskset = TaskGroup()
    taskset.append(task)
    with open(task_path, "w") as handle:
        taskset.store(handle)
    with open(service_path, "w") as handle:
        service.get_config().set_docstore_config(
            cache_path=get_abspath("../test_tmp/cache")).store(handle)

    env = dict(os.environ)
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] += ":" + get_abspath("../")
    else:
        env['PYTHONPATH'] = get_abspath("../")

    subprocess.check_call(
        [get_abspath("../bin/nebula"), "run", service_path, task_path],
        env=env)

    for i in doc.filter():
        print json.dumps(i, indent=4)
def syn_sync(syn, project, docstore, filter=None):
    #download reference files from Synapse and populate the document store
    for a in syn.chunkedQuery('select * from entity where parentId=="%s"' %
                              (project)):
        if filter is None or a['entity.name'] in filter or a[
                'entity.name'].replace(".gz", "") in filter:
            ent = syn.get(a['entity.id'])
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    subprocess.check_call("gunzip -c %s > %s" %
                                          (ent.path, path),
                                          shell=True)
                    name = name.replace(".gz", "")
                else:
                    print "Unknown DataPrep"
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            meta = {}
            meta['name'] = name
            meta['uuid'] = id
            if 'dataPrep' in meta:
                del meta['dataPrep']
            docstore.put(id, meta)
def run_upload(args):
    #syn = synapseclient.Synapse()
    #syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}
    for id, doc in docstore.filter(
            visible=True,
            state='ok',
            name=['OUTPUT_BAM_1', 'OUTPUT_BAM_2', 'OUTPUT_BAM_3']):
        print doc['name'], doc['tags']
        for t in doc['tags']:
            ts = t.split(":")
            if ts[0] == 'donor':
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == 'original_bam':
                bam_map[ts[1]] = id

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    for key, value in bam_map.items():
        t = Target(uuid=value)
        path = docstore.get_filename(t)
        print "%s\t%s" % (value, path)
        os.symlink(path, os.path.join(args.out, "MC3." + key + ".bam"))
def action_cp(args):
    rg = nebula.warpdrive.RemoteGalaxy(args.url, args.api_key)

    if not args.dir:
        docstore = from_url(args.dst)
    else:
        if not os.path.exists(args.dst):
            os.mkdir(args.dst)

    for hda in rg.get_history_contents(args.src):
        if hda['visible']:
            if args.filter is None or re.search(args.filter, hda['name']):
                if hda['name'] not in args.exclude:
                    print hda['name']
                    meta = rg.get_dataset(hda['id'], 'hda')
                    if args.dir:
                        dst_path = os.path.join(args.dst, hda['name'])
                        rg.download(meta['download_url'], dst_path)
                    else:
                        meta['id'] = meta['uuid']  #use the global id
                        hda = Target(uuid=meta['uuid'])
                        docstore.create(hda)
                        path = docstore.get_filename(hda)
                        rg.download(meta['download_url'], path)
                        docstore.update_from_file(hda)
def run_audit(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    for id, ent in docstore.filter(state="ok"):
        if ent['visible']:
            if docstore.size(Target(id)) > 0:
                donor = None
                for i in ent['tags']:
                    t = i.split(":")
                    if t[0] == "donor":
                        donor = t[1]
                if donor not in donor_map:
                    donor_map[donor] = {}
                donor_map[donor][ent['name']] = id

    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        if ent['meta']['Submitter_donor_ID'] in donor_map:
            print ent['meta']['Submitter_donor_ID'], len(
                donor_map[ent['meta']['Submitter_donor_ID']])
def testToolTagging(self):
    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))
    sync_doc_dir(get_abspath("../examples/simple_galaxy/"), doc,
                 uuid_set=[
                     "c39ded10-6073-11e4-9803-0800200c9a66",
                     "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                 ])
    input_file_1 = Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
    input_file_2 = Target(uuid="26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))

    task_tag = nebula.tasks.GalaxyWorkflowTask(
        "workflow_ok",
        workflow,
        inputs={
            'input_file_1': input_file_1,
            'input_file_2': input_file_2
        },
        parameters={"tail_select": {"lineNum": 3}},
        tags=["run:testing"],
        tool_tags={
            "tail_select": {
                "out_file1": ["file:tail"]
            },
            "concat_out": {
                "out_file1": ["file:output"]
            }
        })

    print "Starting Service"
    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable:dev",
                            force=True,
                            port=20022)
    service.start()
    self.service = service
    job = service.submit(task_tag)
    print "JOB", job.get_status()
    service.wait([job])
    self.assertIn(job.get_status(), ['ok'])
    self.assertFalse(service.in_error())
    print service.in_error()
def from_dict(data):
    request = {}
    for k, v in data['inputs'].items():
        if isinstance(v, dict) and 'uuid' in v:
            request[k] = Target(uuid=v['uuid'])
        else:
            request[k] = v
    return GalaxyWorkflowTask(
        data['task_id'],
        workflow=GalaxyWorkflow(data['workflow']),
        inputs=request,
        parameters=data.get('parameters', None),
        tags=data.get('tags', None),
        tool_tags=data.get('tool_tags', None)
    )
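# Illustrative sketch, not part of the original module: shows the intended
# to_dict()/from_dict() round trip for a GalaxyWorkflowTask, mirroring the
# checks in testWorkflow above. The workflow path and task id used here are
# assumptions for demonstration only, and json is assumed to be imported.
def _example_from_dict_roundtrip():
    workflow = GalaxyWorkflow(
        ga_file="examples/simple_galaxy/SimpleWorkflow.ga")
    task = GalaxyWorkflowTask(
        "roundtrip_example",
        workflow,
        inputs={
            'input_file_1': Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66")
        },
        parameters={"tail_select": {"lineNum": 3}})
    # Serialize through JSON and rebuild; the rebuilt task should expose the
    # same inputs as the original.
    rebuilt = from_dict(json.loads(json.dumps(task.to_dict())))
    return task.get_inputs() == rebuilt.get_inputs()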
def testDocStore(self):
    docstore = nebula.docstore.from_url(get_abspath("../test_tmp/docstore"))

    f_uuid = "c39ded10-6073-11e4-9803-0800200c9a66"
    t = Target(f_uuid)
    docstore.update_from_file(
        t,
        get_abspath("../examples/simple_galaxy/P04637.fasta"),
        create=True)
    self.assertEqual(
        docstore.size(t),
        os.stat(
            get_abspath("../examples/simple_galaxy/P04637.fasta")).st_size)
def testCaching(self):
    docstore_1 = nebula.docstore.FileDocStore(
        get_abspath("../test_tmp/docstore"),
        cache_path=get_abspath("../test_tmp/cache_1"))

    f_uuid = "c39ded10-6073-11e4-9803-0800200c9a66"
    t = Target(f_uuid)
    docstore_1.update_from_file(
        t,
        get_abspath("../examples/simple_galaxy/P04637.fasta"),
        create=True)

    docstore_2 = nebula.docstore.FileDocStore(
        get_abspath("../test_tmp/docstore"),
        cache_path=get_abspath("../test_tmp/cache_2"))
    self.assertTrue(docstore_2.exists(t))
    print docstore_2.get_filename(t)
def sync_doc_dir(path, docstore, uuid_set=None, filter=None):
    data_map = scan_doc_dir(path)
    #print "Scanned", path, data_map
    for uuid, path in data_map.items():
        t = Target(uuid)
        if not docstore.exists(t):
            copy = True
            if uuid_set is not None and uuid not in uuid_set:
                copy = False
            if filter is not None:
                with open(path + ".json") as handle:
                    meta = json.loads(handle.read())
                    if not filter(meta):
                        copy = False
            if copy:
                logging.info("Adding file: %s" % (path))
                docstore.update_from_file(t, path, create=True)
                with open(path + ".json") as handle:
                    meta = json.loads(handle.read())
                    docstore.put(t.id, meta)
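# Illustrative sketch, not part of the original module: populate a docstore
# with sync_doc_dir and look up the on-disk path of one entry by uuid. The
# paths and uuid here are taken from the tests above; FileDocStore and Target
# are assumed to be importable in this context.
def _example_sync_doc_dir():
    doc = FileDocStore(file_path="./test_tmp/docstore")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66"])
    # Once synced, files are addressed by uuid through the docstore
    return doc.get_filename(
        Target(uuid="c39ded10-6073-11e4-9803-0800200c9a66"))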
def run_extract(args):
    docstore = from_url(args.out_base)

    for id, ent in docstore.filter(file_ext="vcf",
                                   name=[
                                       "muse.vcf", "pindel.vcf",
                                       "radia.dna-rna.vcf", "radia.dna.vcf",
                                       "somatic_sniper.vcf",
                                       "varscan.indel.vcf", "varscan.snp.vcf",
                                       "mutect.vcf"
                                   ]):
        t = Target(uuid=ent['id'])
        if docstore.size(t) > 0:
            donor = None
            for e in ent['tags']:
                tmp = e.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
            if donor is not None:
                donor_dir = os.path.join(args.out_dir, donor)
                if not os.path.exists(donor_dir):
                    os.makedirs(donor_dir)
                print "Found", donor, ent['name']
                shutil.copy(docstore.get_filename(t),
                            os.path.join(donor_dir, ent['name']))
def run_query(docstore, fields, size, filters):
    doc = FileDocStore(file_path=docstore)

    filter = {}
    for k in filters:
        tmp = k.split("=")
        filter[tmp[0]] = tmp[1]

    for id, entry in doc.filter(**filter):
        if fields is None or len(fields) == 0:
            line = entry
        else:
            line = dict((i, entry.get(i, "")) for i in fields)
        if size:
            size_value = doc.size(Target(uuid=entry['uuid']))
        else:
            size_value = ""
        print size_value, json.dumps(line)
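# Illustrative call, not part of the original module (the docstore path is an
# assumption): each "key=value" string in filters becomes a keyword filter on
# doc.filter(), so this prints the size, name and state of every document
# whose state is "ok".
def _example_run_query():
    run_query("./test_tmp/docstore",
              fields=["name", "state"],
              size=True,
              filters=["state=ok"])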
def run_synapse(docstore, parent, workdir):
    doc = FileDocStore(file_path=docstore)

    syn = synapseclient.Synapse()
    syn.login()

    for id, entry in doc.filter(visible=True,
                                data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            name = entry['name']
            name = re.sub(r'.vcf$', '', name)
            file_name = sample + "." + name + ".snv_mnv.vcf"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(workdir, file_name)

                query = "select * from entity where parentId=='%s' and name=='%s'" % (
                    parent, file_name + ".gz")
                r = syn.query(query)['results']
                if len(r) == 0:
                    #print r
                    print dst_file
                    shutil.copy(src_file, dst_file)
                    subprocess.check_call("bgzip %s" % (dst_file), shell=True)
                    f = synapseclient.File(dst_file + ".gz",
                                           parentId=parent,
                                           name=file_name + ".gz")
                    f.fileType = 'vcf'
                    f.pipeline = 'UCSC'
                    f.variant_type = "snv"
                    f = syn.store(
                        f,
                        executed="https://github.com/ucsccancer/pcawg_tools")
                else:
                    print "Skipping", file_name
def run_uploadprep(args):
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    file_map = {'broad': {}, 'muse': {}, 'broad_tar': {}}

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    uuid_map = {}
    uuid_map['broad'] = synqueue.getValues(syn,
                                           "Broad_VCF_UUID",
                                           orSet=lambda x: str(uuid.uuid4()),
                                           **config)
    uuid_map['muse'] = synqueue.getValues(syn,
                                          "Muse_VCF_UUID",
                                          orSet=lambda x: str(uuid.uuid4()),
                                          **config)
    uuid_map['broad_tar'] = synqueue.getValues(
        syn, "Broad_TAR_UUID", orSet=lambda x: str(uuid.uuid4()), **config)

    #scan through all of the docs
    for id, entry in doc.filter():
        donor = None
        #look for docs with donor tags
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None and donor in wl_map:
            if donor not in job_map:
                job_map[donor] = {}
            #scan out the job metrics for this job
            if 'job' in entry and 'job_metrics' in entry['job']:
                job_id = entry['job']['id']
                tool_id = entry['job']['tool_id']
                job_info = {tool_id: {}}
                for met in entry['job']['job_metrics']:
                    job_info[tool_id][met['name']] = met['raw_value']
                job_map[donor][job_id] = job_info
            donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']
            #look for the vcf output files
            if entry.get('visible', False) and entry.get(
                    'extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                #fill out the info depending on which caller created the file
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in [
                        'broad-dRanger', 'broad-dRanger_snowman',
                        'broad-snowman', 'broad-mutect'
                ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" %
                                        (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))
                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr,
                                           variant_type, call_type)
                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)
                    shutil.copy(src_file, dst_file)
                    #if the file wasn't compressed already, go ahead and do that
                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip -c %s > %s.gz" %
                                              (dst_file, dst_file),
                                              shell=True)
                        dst_file = dst_file + ".gz"
                    #add file to output map
                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)
            else:
                if entry['name'] == "broad.tar.gz":
                    target = Target(uuid=entry['uuid'])
                    src_file = doc.get_filename(target)
                    file_map['broad_tar'][donor] = [src_file]

    timing_map = {}
    for donor in job_map:
        timing_map[donor] = {}
        for job_id in job_map[donor]:
            for tool_id in job_map[donor][job_id]:
                if tool_id not in timing_map[donor]:
                    timing_map[donor][tool_id] = []
                timing_map[donor][tool_id].append(
                    job_map[donor][job_id][tool_id])

    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    #go through every pipeline
    for pipeline, donors in file_map.items():
        #for that pipeline go through every donor
        for donor, files in donors.items():
            #we're only outputting data for donors on the work list
            if donor in wl_map and result_counts[donor] == 3:
                #output the timing json
                timing_json = os.path.abspath(
                    os.path.join(args.workdir,
                                 "%s.%s.timing.json" % (pipeline, donor)))
                with open(timing_json, "w") as handle:
                    handle.write(json.dumps(timing_map[donor]))
                #output the uploader script
                with open(
                        os.path.join(args.workdir,
                                     "%s.%s.sh" % (pipeline, donor)),
                        "w") as handle:
                    input_file = os.path.basename(dst_file)
                    urls = [
                        "%scghub/metadata/analysisFull/%s" %
                        (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'],
                         wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" %
                        (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'],
                         wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    donor_tumor = wl_map[donor][
                        'Tumour_WGS_alignment_GNOS_analysis_IDs']
                    if pipeline in ['broad', 'muse']:
                        prep_cmd_str = ""
                        for vcf in files:
                            prep_cmd_str += "tabix -p vcf %s\n" % (vcf)
                            prep_cmd_str += "mv %s.tbi %s.idx\n" % (vcf, vcf)
                            prep_cmd_str += "md5sum %s | awk '{print$1}' > %s.md5\n" % (
                                vcf, vcf)
                            prep_cmd_str += "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" % (
                                vcf, vcf)
                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])
                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --vcfs %s " % (",".join(files))
                        submit_cmd_str += " --vcf-md5sum-files %s " % (",".join(
                            "%s.md5" % i for i in files))
                        submit_cmd_str += " --vcf-idxs %s" % (",".join(
                            "%s.idx" % i for i in files))
                        submit_cmd_str += " --vcf-idx-md5sum-files %s" % (",".join(
                            "%s.idx.md5" % i for i in files))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline,
                                                                   donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (
                            args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --vm-location-code %s" % args.vm_location_code
                        submit_cmd_str += " --timing-metrics-json %s" % (
                            timing_json)
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (
                            ",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (
                            uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"
                    if pipeline in ['broad_tar']:
                        prep_cmd_str = ""
                        new_files = []
                        for tar in files:
                            basename = donor_tumor + ".broad.intermediate"
                            prep_cmd_str = "%s/remap_broad_tar.py %s %s %s --rename %s %s" % (
                                os.path.dirname(os.path.abspath(__file__)),
                                tar, "./", basename, donor, donor_tumor)
                            new_files.append(basename + ".tar")
                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])
                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --tarballs %s " % (",".join(new_files))
                        submit_cmd_str += " --tarball-md5sum-files %s " % (",".join(
                            "%s.md5" % i for i in new_files))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline,
                                                                   donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (
                            args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --timing-metrics-json %s" % (
                            timing_json)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (
                            ",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (
                            uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"
                    handle.write(
                        string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted
#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""").substitute(PREP=prep_cmd_str,
                SUBMIT=submit_cmd_str,
                SUBMIT_DIR=os.path.join(
                    os.path.abspath(args.workdir), "vcf",
                    pipeline + "." + donor_tumor + ".dir",
                    uuid_map[pipeline][donor]),
                KEY=args.keyfile))
def run_scan(docstore, workdir, keyfile, upload_url, manifest):
    doc = FileDocStore(file_path=docstore)

    file_map = {
        'broad': {},
        'muse': {}
    }

    wl_map = {}
    with open(manifest) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            wl_map[row['Donor_ID']] = row

    for id, entry in doc.filter(visible=True):
        if entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
            if 'tags' in entry:
                sample = None
                for s in entry['tags']:
                    tmp = s.split(":")
                    if tmp[0] == 'sample':
                        sample = tmp[1]
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'] in ['MUSE_1.0rc', 'MUSE_0.9.9.5']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in [
                        'broad-dRanger', 'broad-dRanger_snowman',
                        'broad-snowman', 'broad-mutect'
                ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" %
                                        (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))
                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (sample, method, datestr,
                                           variant_type, call_type)
                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                print file_name
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(workdir, file_name)
                    shutil.copy(src_file, dst_file)

                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip %s" % dst_file, shell=True)
                        dst_file = dst_file + ".gz"

                    subprocess.check_call("tabix -p vcf %s" % (dst_file),
                                          shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))

                    subprocess.check_call(
                        "md5sum %s | awk '{print$1}' > %s.md5" %
                        (dst_file, dst_file),
                        shell=True)
                    subprocess.check_call(
                        "md5sum %s.idx | awk '{print$1}' > %s.idx.md5" %
                        (dst_file, dst_file),
                        shell=True)

                    if sample not in file_map[pipeline]:
                        file_map[pipeline][sample] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][sample].append(input_file)

    for pipeline, samples in file_map.items():
        for sample, files in samples.items():
            with open(os.path.join(workdir, "%s.%s.sh" % (pipeline, sample)),
                      "w") as handle:
                input_file = os.path.basename(dst_file)
                urls = [
                    "%scghub/metadata/analysisFull/%s" %
                    (wl_map[sample]['Normal_GNOS_endpoint'],
                     wl_map[sample]['Normal_Analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" %
                    (wl_map[sample]['Tumour_GNOS_endpoint'],
                     wl_map[sample]['Tumour_Analysis_ID'])
                ]
                cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                cmd_str += " --metadata-urls %s" % (",".join(urls))
                cmd_str += " --vcfs %s " % (",".join(files))
                cmd_str += " --vcf-md5sum-files %s " % (",".join(
                    "%s.md5" % i for i in files))
                cmd_str += " --vcf-idxs %s" % (",".join(
                    "%s.idx" % i for i in files))
                cmd_str += " --vcf-idx-md5sum-files %s" % (",".join(
                    "%s.idx.md5" % i for i in files))
                cmd_str += " --outdir %s.%s.dir" % (pipeline, sample)
                cmd_str += " --key %s " % (keyfile)
                cmd_str += " --upload-url %s" % (upload_url)
                cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                handle.write("""#!/bin/bash
%s
""" % (cmd_str))
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' %
                                  (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    subprocess.check_call("gunzip -c %s > %s" %
                                          (ent.path, path),
                                          shell=True)
                    name = name.replace(".gz", "")
                else:
                    print "Unknown DataPrep"
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            meta = {}
            meta['name'] = name
            meta['uuid'] = id
            if 'dataPrep' in meta:
                del meta['dataPrep']
            docstore.put(id, meta)

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow_2 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga")
    workflow_3 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga")

    ref_rename = {"HG19_Broad_variant": "Homo_sapiens_assembly19"}

    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        bam_set = list(
            a[1] for a in ent['meta'].items()
            if a[0].startswith("id_") and isinstance(a[1], basestring))
        ref_set = set(
            a[1] for a in ent['meta'].items()
            if a[0].startswith("ref_assembly_") and isinstance(a[1], basestring))
        assert (len(ref_set) == 1)
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]
        hit = None
        for a in docstore.filter(name=ref_name + ".fasta"):
            hit = a[0]
        for a in docstore.filter(name=ref_name + ".fa"):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}
        if len(bam_set) == 2:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_2,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {
                        "output_bam": ["original_bam:%s" % (bam_set[0])]
                    },
                    "BQSR_2": {
                        "output_bam": ["original_bam:%s" % (bam_set[1])]
                    }
                })
            tasks.append(task)
        elif len(bam_set) == 3:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_3,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_3': {
                        "uuid": bam_set[2],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {
                        "output_bam": ["original_bam:%s" % (bam_set[0])]
                    },
                    "BQSR_2": {
                        "output_bam": ["original_bam:%s" % (bam_set[1])]
                    },
                    "BQSR_3": {
                        "output_bam": ["original_bam:%s" % (bam_set[2])]
                    }
                })
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=True,
                                tool_data=args.tool_data,
                                tool_dir=args.tool_dir,
                                work_dir=args.work_dir,
                                smp=[["gatk_bqsr", 12], ["gatk_indel", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch,
                                      open_perms=True)
            s.store(handle)
def testWorkflowCaching(self):
    input = {
        "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {"tail_select": {"lineNum": 3}}

    doc = nebula.docstore.FileDocStore(
        get_abspath("../test_tmp/docstore"),
        cache_path=get_abspath("../test_tmp/cache"))
    logging.info("Adding files to object store")
    sync_doc_dir(get_abspath("../examples/simple_galaxy/"), doc,
                 uuid_set=[
                     "c39ded10-6073-11e4-9803-0800200c9a66",
                     "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                 ])
    logging.info("Creating Task")
    workflow = GalaxyWorkflow(
        ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask("test_workflow",
                                           workflow,
                                           inputs=input,
                                           parameters=parameters,
                                           tags=["run:testing"],
                                           tool_tags={
                                               "tail_select": {
                                                   "out_file1": ["file:tail"]
                                               },
                                               "concat_out": {
                                                   "out_file1": ["file:output"]
                                               }
                                           })

    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable:dev",
                            force=True,
                            port=20022)
    self.service = service
    logging.info("Starting Service")
    print "Starting service"
    service.start()
    self.assertFalse(service.in_error())
    logging.info("Starting Tasks")
    job = service.submit(task)
    self.assertTrue(isinstance(job, TaskJob))
    self.assertFalse(service.in_error())
    #logging.info("Waiting")
    service.wait([job])

    found = False
    for id, info in doc.filter(tags="file:output"):
        logging.info("Found result object: %s size: %d" % (id, doc.size(info)))
        self.assertTrue(doc.size(info) > 0)
        found = True
    self.assertTrue(found)
    self.assertFalse(service.in_error())
    self.assertIn(job.get_status(), ['ok'])
def runService(self):
    #FIXME: the 'file_path' value is specific to the DiskObjectStore
    docstore_path = self.docstore.file_path

    if 'lib_data' in self.config:
        self.config['lib_data'].append(self.docstore.local_cache_base())
    else:
        self.config['lib_data'] = [self.docstore.local_cache_base()]

    self.rg = run_up(**self.config)
    library_id = self.rg.library_find("Imported")['id']
    folder_id = self.rg.library_find_contents(library_id, "/")['id']

    self.ready = True
    logging.info("Galaxy Running")
    while self.running:
        time.sleep(3)
        req = self.get_queued()
        if req is not None:
            logging.info("Received task request")
            uuid_ldda_map = {}
            with self.queue_lock:
                job_id, job = req
                wids = []
                for k, v in job.get_inputs().items():
                    file_path = self.docstore.get_filename(Target(v.id))
                    file_meta = self.docstore.get(v.id)
                    file_name = v.id
                    if 'name' in file_meta:
                        file_name = file_meta['name']
                    logging.info("Loading FilePath: %s (%s) %s" %
                                 (v.id, file_name, file_path))
                    nli = self.rg.library_paste_file(
                        library_id=library_id,
                        library_folder_id=folder_id,
                        name=file_name,
                        datapath=file_path,
                        uuid=v.uuid)
                    if 'id' not in nli:
                        raise Exception("Failed to load data: %s" % (str(nli)))
                    wids.append(nli['id'])
                    uuid_ldda_map[v.uuid] = nli['id']

                #wait for the uploading of the files to finish
                while True:
                    done = True
                    for w in wids:
                        d = self.rg.library_get_contents(library_id, w)
                        if d['state'] == 'error':
                            raise Exception("Data loading Error")
                        if d['state'] != 'ok':
                            logging.debug("Data loading: %s" % (d['state']))
                            done = False
                            break
                    if done:
                        break
                    time.sleep(2)

                workflow_data = job.task.to_dict()['workflow']
                logging.info("Loading Workflow: %s" % (workflow_data['uuid']))
                self.rg.add_workflow(workflow_data)
                wf = GalaxyWorkflow(workflow_data)
                print "uuid_map", uuid_ldda_map
                request = job.task.get_workflow_request(uuid_ldda_map)
                print "Calling Workflow", json.dumps(request)
                invc = self.rg.call_workflow(request=request)
                print "Called Workflow", json.dumps(invc)
                if 'err_msg' in invc:
                    logging.error("Workflow invocation failed")
                    job.set_error("Workflow Invocation Failed")
                else:
                    job.history = invc['history']
                    job.instance_id = invc['uuid']
                    job.outputs = {}
                    job.hidden = {}
                    wf_outputs = wf.get_outputs()
                    for step in invc['steps']:
                        if 'outputs' in step:
                            step_name = step['workflow_step_label'] if step[
                                'workflow_step_label'] is not None else str(
                                    step['workflow_step_uuid'])
                            for ok, ov in step['outputs'].items():
                                output_name = "%s|%s" % (step_name, ok)
                                if output_name in wf_outputs:
                                    #filter out produced items that are not part of the final output
                                    job.outputs[output_name] = ov
                                else:
                                    job.hidden[output_name] = ov

    down_config = {}
    #if "work_dir" in self.config:
    #    down_config['work_dir'] = self.config['work_dir']
    run_down(name=self.config['name'],
             rm=True,
             sudo=self.config.get("sudo", False),
             **down_config)
def run_get(docstore, uuid, outpath):
    doc = FileDocStore(file_path=docstore)
    print doc.get_filename(Target(uuid=uuid))
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome": "genome.fa",
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere": "centromere_hg19.bed"
    }

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' %
                                  (REFDATA_PROJECT)):
            print "found", a['entity.name']
            if a['entity.name'] in data_mapping.values() or a[
                    'entity.name'].replace(".gz", "") in data_mapping.values():
                print "loading"
                ent = syn.get(a['entity.id'])
                id = ent.annotations['uuid'][0]
                t = Target(uuid=id)
                docstore.create(t)
                path = docstore.get_filename(t)
                name = ent.name
                if 'dataPrep' in ent.annotations:
                    if ent.annotations['dataPrep'][0] == 'gunzip':
                        subprocess.check_call("gunzip -c %s > %s" %
                                              (ent.path, path),
                                              shell=True)
                        name = name.replace(".gz", "")
                    else:
                        print "Unknown DataPrep"
                else:
                    shutil.copy(ent.path, path)
                docstore.update_from_file(t)
                meta = {}
                meta['name'] = name
                meta['uuid'] = id
                if 'dataPrep' in meta:
                    del meta['dataPrep']
                docstore.put(id, meta)

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")

    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        #print "'%s'" % (ent['state']), ent['state'] == 'nan', type(ent['state']), type('nan')
        if not isinstance(ent['state'], basestring) and isnan(ent['state']):
            gnos_endpoint = urlparse(
                ent['meta']['Normal_WGS_alignment_GNOS_repos']).netloc
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow,
                inputs=dm,
                parameters={
                    'normal_bam_download': {
                        "uuid":
                        ent['meta']['Normal_WGS_alignment_GNOS_analysis_ID'],
                        "gnos_endpoint": gnos_endpoint,
                        "cred_file": key_map[gnos_endpoint]
                    },
                    'tumor_bam_download': {
                        "uuid":
                        ent['meta']['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                        "gnos_endpoint": gnos_endpoint,
                        "cred_file": key_map[gnos_endpoint]
                    },
                    'broad_variant_pipeline': {
                        "broad_ref_dir": "/tool_data/files/refdata",
                        "sample_id": ent['meta']['Submitter_donor_ID']
                    }
                },
                tags=["donor:%s" % (ent['meta']['Submitter_donor_ID'])])
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))
        state_file = "%s.tasks/%s.state" % (args.out_base, data.task_id)
        if os.path.exists(state_file):
            os.unlink(state_file)
    print "Tasks Created: %s" % (len(tasks))

    if args.create_service:
        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=args.sudo,
                                tool_data=os.path.abspath("tool_data"),
                                tool_dir=os.path.abspath("tools"),
                                work_dir=args.work_dir,
                                smp=[["MuSE", 8], ["pindel", 8], ["muTect", 8],
                                     ["delly", 4], ["gatk_bqsr", 12],
                                     ["gatk_indel", 24], ["bwa_mem", 12],
                                     ["broad_variant_pipline", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch,
                                      open_perms=True)
            s.store(handle)
def testRunSimple(self):
    input = {
        "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {"tail_select": {"lineNum": 3}}
    bad_parameters = dict(parameters)
    del bad_parameters['tail_select']

    doc = FileDocStore(file_path="./test_tmp/docstore")
    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=[
                     "c39ded10-6073-11e4-9803-0800200c9a66",
                     "26fd12a2-9096-4af2-a989-9e2f1cb692fe"
                 ])
    logging.info("Creating Task")
    workflow = GalaxyWorkflow(
        ga_file="examples/simple_galaxy/SimpleWorkflow.ga")
    task = nebula.tasks.GalaxyWorkflowTask("test_workflow",
                                           workflow,
                                           inputs=input,
                                           parameters=parameters)
    task_data = task.to_dict()
    #make sure the task data can be serialized
    task_data_str = json.dumps(task_data)

    service = GalaxyService(docstore=doc,
                            name="nosetest_galaxy",
                            galaxy="bgruening/galaxy-stable",
                            force=True,
                            port=20022)
    self.service = service

    #make sure the generated task is serializable
    new_task_data = json.loads(task_data_str)
    new_task = nebula.tasks.from_dict(new_task_data)

    logging.info("Starting Service")
    print "Starting service"
    service.start()
    self.assertFalse(service.in_error())
    logging.info("Starting Tasks")
    job = service.submit(new_task)
    self.assertTrue(isinstance(job, TaskJob))
    self.assertFalse(service.in_error())
    #logging.info("Waiting")
    service.wait([job])
    self.assertIn(job.get_status(), ['ok'])

    bad_task = nebula.tasks.GalaxyWorkflowTask("test_workflow_bad",
                                               workflow,
                                               inputs=input,
                                               parameters=bad_parameters)
    job = service.submit(bad_task)
    service.wait([job])
    self.assertIn(job.get_status(), ['error'])
    self.assertFalse(service.in_error())