def action_cp(args):
    rg = nebula.warpdrive.RemoteGalaxy(args.url, args.api_key)

    if not args.dir:
        docstore = from_url(args.dst)
    else:
        if not os.path.exists(args.dst):
            os.mkdir(args.dst)

    for hda in rg.get_history_contents(args.src):
        if hda['visible']:
            if args.filter is None or re.search(args.filter, hda['name']):
                if hda['name'] not in args.exclude:
                    print hda['name']
                    meta = rg.get_dataset(hda['id'], 'hda')
                    if args.dir:
                        dst_path = os.path.join(args.dst, hda['name'])
                        rg.download(meta['download_url'], dst_path)
                    else:
                        meta['id'] = meta['uuid']  #use the global id
                        hda = Target(uuid=meta['uuid'])
                        docstore.create(hda)
                        path = docstore.get_filename(hda)
                        rg.download(meta['download_url'], path)
                        docstore.update_from_file(hda)
def run_audit(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    for id, ent in docstore.filter(state="ok"):
        if ent['visible']:
            if docstore.size(Target(id)) > 0:
                donor = None
                for i in ent['tags']:
                    t = i.split(":")
                    if t[0] == "donor":
                        donor = t[1]
                if donor not in donor_map:
                    donor_map[donor] = {}
                donor_map[donor][ent['name']] = id

    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        if ent['meta']['Submitter_donor_ID'] in donor_map:
            print ent['meta']['Submitter_donor_ID'], len(donor_map[ent['meta']['Submitter_donor_ID']])
def run_upload(args):
    # syn = synapseclient.Synapse()
    # syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}
    for id, doc in docstore.filter(visible=True, state="ok", name=["OUTPUT_BAM_1", "OUTPUT_BAM_2", "OUTPUT_BAM_3"]):
        print doc["name"], doc["tags"]
        for t in doc["tags"]:
            ts = t.split(":")
            if ts[0] == "donor":
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == "original_bam":
                bam_map[ts[1]] = id

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    for key, value in bam_map.items():
        t = Target(uuid=value)
        path = docstore.get_filename(t)
        print "%s\t%s" % (value, path)
        os.symlink(path, os.path.join(args.out, "MC3." + key + ".bam"))
def run_extract(args):
    docstore = from_url(args.out_base)

    for id, ent in docstore.filter(file_ext="vcf", name=[
            "muse.vcf", "pindel.vcf", "radia.dna-rna.vcf", "radia.dna.vcf",
            "somatic_sniper.vcf", "varscan.indel.vcf", "varscan.snp.vcf",
            "mutect.vcf"]):
        t = Target(uuid=ent["id"])
        if docstore.size(t) > 0:
            donor = None
            for e in ent["tags"]:
                tmp = e.split(":")
                if tmp[0] == "donor":
                    donor = tmp[1]
            if donor is not None:
                donor_dir = os.path.join(args.out_dir, donor)
                if not os.path.exists(donor_dir):
                    os.makedirs(donor_dir)
                print "Found", donor, ent["name"]
                shutil.copy(docstore.get_filename(t), os.path.join(donor_dir, ent["name"]))
def run_errors(args):
    doc = from_url(args.out_base)

    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            if args.within is None or 'update_time' not in entry or check_within(entry['update_time'], args.within):
                print "Dataset", id, entry.get("job", {}).get("tool_id", ""), entry.get('update_time', ''), entry.get("tags", "")
                if args.full:
                    if 'provenance' in entry:
                        print "tool:", entry['provenance']['tool_id']
                        print "-=-=-=-=-=-=-"
                    print entry['job']['stdout']
                    print "-------------"
                    print entry['job']['stderr']
                    print "-=-=-=-=-=-=-"
def run_upload(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}
    for id, doc in docstore.filter(visible=True, state='ok', name=['OUTPUT_BAM_1', 'OUTPUT_BAM_2', 'OUTPUT_BAM_3']):
        print doc['name'], doc['tags']
        for t in doc['tags']:
            ts = t.split(":")
            if ts[0] == 'donor':
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == 'original_bam':
                bam_map[ts[1]] = id

    print donor_map
    print bam_map
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "cosmic": "b37_cosmic_v54_120711.vcf",
    }

    ref_genomes = [
        "Homo_sapiens_assembly19.fasta",
        "GRCh37-lite.fa",
        "GRCh37-lite-+-HPV_Redux-build.fa",
        "GRCh37-lite_WUGSC_variant_1.fa.gz",
        "GRCh37-lite_WUGSC_variant_2.fa.gz",
        "hg19_M_rCRS.fa.gz",
    ]

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values() + ref_genomes)

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    mc3_dna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA.ga")
    mc3_dnarna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA_RNA.ga")

    rna_hit = None
    for a in docstore.filter(name="hg19_M_rCRS.fa"):
        rna_hit = a[0]

    tasks = TaskGroup()
    assembly_hits = {}
    with open(args.joblist) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            if row["normal_assembly"] != row["tumor_assembly"]:
                print "Row Mismatch", row["normal_assembly"], row["tumor_assembly"]
                # raise Exception("Mismatch reference")
            ref_name = row["normal_assembly"]
            if ref_name in ref_rename:
                ref_name = ref_rename[ref_name]
            if ref_name in assembly_hits:
                hit = assembly_hits[ref_name]
            else:
                hit = None
                for a in docstore.filter(name=ref_name + ".fasta"):
                    hit = a[0]
                for a in docstore.filter(name=ref_name + ".fa"):
                    hit = a[0]
                if hit is None:
                    raise Exception("%s not found" % (ref_name))
                assembly_hits[ref_name] = hit
            workflow_dm = dict(dm)
            workflow_dm["reference_genome"] = {"uuid": hit}
            params = {
                "tumor_bam": {
                    "uuid": row["tumor_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                },
                "normal_bam": {
                    "uuid": row["normal_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                },
                "reheader_config": {
                    "platform": "Illumina",
                    "center": "OHSU",
                    "reference_genome": ref_name,
                    "participant_uuid": row["participant_id"],
                    "disease_code": row["disease"],
                    "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid": row["normal_analysis_id"],
                    "normal_bam_name": row["normal_filename"],
                    "normal_aliquot_uuid": row["normal_aliquot_id"],
                    "normal_aliquot_barcode": row["normal_barcode"],
                    "tumor_analysis_uuid": row["tumor_analysis_id"],
                    "tumor_bam_name": row["tumor_filename"],
                    "tumor_aliquot_uuid": row["tumor_aliquot_id"],
                    "tumor_aliquot_barcode": row["tumor_barcode"],
                },
            }
            if row["rna_analysis_id"] != "NA":
                params["rna_tumor_bam"] = {
                    "uuid": row["rna_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                }
                workflow_dm["rna_reference_genome"] = {"uuid": rna_hit}
                task = GalaxyWorkflowTask(
                    "workflow_%s" % (row["job_id"]),
                    mc3_dnarna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=["donor:%s" % (row["participant_id"])],
                )
            else:
                task = GalaxyWorkflowTask(
                    "workflow_%s" % (row["job_id"]),
                    mc3_dna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=["donor:%s" % (row["participant_id"])],
                )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    service = GalaxyService(
        docstore=docstore,
        galaxy=args.galaxy,
        sudo=args.sudo,
        tool_data=args.tool_data,
        tool_dir=args.tool_dir,
        work_dir=args.work_dir,
        smp=[
            ["gatk_bqsr", 12],
            ["gatk_indel", 24],
            ["MuSE", 8],
            ["pindel", 8],
            ["mutect", 8],
            ["delly", 4],
            ["gatk_bqsr", 12],
            ["gatk_indel", 12],
            ["bwa_mem", 12],
            ["radia", 8],
            ["radia_filter", 8],
        ],
    )

    with open("%s.service" % (args.out_base), "w") as handle:
        s = service.get_config()
        if args.scratch:
            print "Using scratch", args.scratch
            s.set_docstore_config(cache_path=args.scratch, open_perms=True)
        s.store(handle)
def run_uploadprep(args):
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    doc = from_url(args.out_base)
    file_map = {
        'broad': {},
        'muse': {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    for id, entry in doc.filter():
        donor = None
        if 'tags' in entry:
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None:
            if donor not in job_map:
                job_map[donor] = {}
            if 'job' in entry and 'job_metrics' in entry['job']:
                print entry['name']
                for met in entry['job']['job_metrics']:
                    if met['name'] == 'runtime_seconds':
                        job_map[donor][entry['name']] = {
                            "tool_id": entry['job']['tool_id'],
                            "runtime_seconds": met['raw_value']
                        }
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect']:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor, method, datestr, variant_type, call_type)
                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"

                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)
                    shutil.copy(src_file, dst_file)
                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip -c %s > %s.gz" % (dst_file, dst_file), shell=True)
                        dst_file = dst_file + ".gz"
                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)

                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)

    for pipeline, donors in file_map.items():
        for donor, files in donors.items():
            if donor in wl_map:
                """
                with open(os.path.join(args.workdir, "%s.%s.pipeline.json" % (pipeline, donor)), "w") as handle:
                    handle.write(json.dumps({"pipeline_src": args.pipeline_src, "pipeline_version": args.pipeline_version}))
                """
                timing_json = os.path.join(args.workdir, "%s.%s.timing.json" % (pipeline, donor))
                with open(timing_json, "w") as handle:
                    handle.write(json.dumps(job_map[donor]))
                with open(os.path.join(args.workdir, "%s.%s.sh" % (pipeline, donor)), "w") as handle:
                    input_file = os.path.basename(dst_file)
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                    cmd_str += " --metadata-urls %s" % (",".join(urls))
                    cmd_str += " --vcfs %s " % (",".join(files))
                    cmd_str += " --vcf-md5sum-files %s " % (",".join("%s.md5" % i for i in files))
                    cmd_str += " --vcf-idxs %s" % (",".join("%s.idx" % i for i in files))
                    cmd_str += " --vcf-idx-md5sum-files %s" % (",".join("%s.idx.md5" % i for i in files))
                    cmd_str += " --outdir %s.%s.dir" % (pipeline, donor)
                    cmd_str += " --key %s " % (args.keyfile)
                    cmd_str += " --upload-url %s" % (args.upload_url)
                    cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                    cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                    cmd_str += " --timing-metrics-json %s" % (timing_json)
                    handle.write("#!/bin/bash\n%s\n" % (cmd_str))
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--out-base", default="pcawg") parser.add_argument("--ref-download", action="store_true", default=False) parser.add_argument("--create-service", action="store_true", default=False) parser.add_argument("--pilot", action="store_true", default=False) parser.add_argument("--scratch", default=None) args = parser.parse_args() syn = synapseclient.Synapse() syn.login() docstore = from_url(args.out_base) if args.ref_download: #download reference files from Synapse and populate the document store for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)): ent = syn.get(a['entity.id']) id = ent.annotations['uuid'][0] t = Target(uuid=id) docstore.create(t) path = docstore.get_filename(t) name = ent.name if 'dataPrep' in ent.annotations: if ent.annotations['dataPrep'][0] == 'gunzip': subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True) name = name.replace(".gz", "")
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome": "genome.fa",
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels": "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere": "centromere_hg19.bed"
    }

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            print "found", a['entity.name']
            if a['entity.name'] in data_mapping.values() or a['entity.name'].replace(".gz", "") in data_mapping.values():
                print "loading"
                ent = syn.get(a['entity.id'])
                id = ent.annotations['uuid'][0]
                t = Target(uuid=id)
                docstore.create(t)
                path = docstore.get_filename(t)
                name = ent.name
                if 'dataPrep' in ent.annotations:
                    if ent.annotations['dataPrep'][0] == 'gunzip':
                        subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                        name = name.replace(".gz", "")
                    else:
                        print "Unknown DataPrep"
                else:
                    shutil.copy(ent.path, path)
                docstore.update_from_file(t)
                meta = {}
                meta['name'] = name
                meta['uuid'] = id
                if 'dataPrep' in meta:
                    del meta['dataPrep']
                docstore.put(id, meta)

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")

    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        #print "'%s'" % (ent['state']), ent['state'] == 'nan', type(ent['state']), type('nan')
        if not isinstance(ent['state'], basestring) and isnan(ent['state']):
            gnos_endpoint = urlparse(ent['meta']['Normal_WGS_alignment_GNOS_repos']).netloc
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow,
                inputs=dm,
                parameters={
                    'normal_bam_download': {
                        "uuid": ent['meta']['Normal_WGS_alignment_GNOS_analysis_ID'],
                        "gnos_endpoint": gnos_endpoint,
                        "cred_file": key_map[gnos_endpoint]
                    },
                    'tumor_bam_download': {
                        "uuid": ent['meta']['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                        "gnos_endpoint": gnos_endpoint,
                        "cred_file": key_map[gnos_endpoint]
                    },
                    'broad_variant_pipeline': {
                        "broad_ref_dir": "/tool_data/files/refdata",
                        "sample_id": ent['meta']['Submitter_donor_ID']
                    }
                },
                tags=["donor:%s" % (ent['meta']['Submitter_donor_ID'])]
            )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))
        state_file = "%s.tasks/%s.state" % (args.out_base, data.task_id)
        if os.path.exists(state_file):
            os.unlink(state_file)
    print "Tasks Created: %s" % (len(tasks))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=args.sudo,
            tool_data=os.path.abspath("tool_data"),
            tool_dir=os.path.abspath("tools"),
            work_dir=args.work_dir,
            smp=[
                ["MuSE", 8],
                ["pindel", 8],
                ["muTect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["bwa_mem", 12],
                ["broad_variant_pipline", 24]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                    name = name.replace(".gz", "")
                else:
                    print "Unknown DataPrep"
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            meta = {}
            meta['name'] = name
            meta['uuid'] = id
            if 'dataPrep' in meta:
                del meta['dataPrep']
            docstore.put(id, meta)

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels": "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow_2 = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga")
    workflow_3 = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga")

    ref_rename = {"HG19_Broad_variant": "Homo_sapiens_assembly19"}

    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        bam_set = list(a[1] for a in ent['meta'].items() if a[0].startswith("id_") and isinstance(a[1], basestring))
        ref_set = set(a[1] for a in ent['meta'].items() if a[0].startswith("ref_assembly_") and isinstance(a[1], basestring))
        assert(len(ref_set) == 1)
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]
        hit = None
        for a in docstore.filter(name=ref_name + ".fasta"):
            hit = a[0]
        for a in docstore.filter(name=ref_name + ".fa"):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}
        if len(bam_set) == 2:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_2,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {"output_bam": ["original_bam:%s" % (bam_set[0])]},
                    "BQSR_2": {"output_bam": ["original_bam:%s" % (bam_set[1])]}
                })
            tasks.append(task)
        elif len(bam_set) == 3:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_3,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_3': {
                        "uuid": bam_set[2],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {"output_bam": ["original_bam:%s" % (bam_set[0])]},
                    "BQSR_2": {"output_bam": ["original_bam:%s" % (bam_set[1])]},
                    "BQSR_3": {"output_bam": ["original_bam:%s" % (bam_set[2])]}
                })
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=True,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[["gatk_bqsr", 12], ["gatk_indel", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
REFDATA_PROJECT = "syn3241088"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("docstore", help="DocStore")
    parser.add_argument("--ref-download", action="store_true", default=False)
    parser.add_argument("--create-service", action="store_true", default=False)
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.docstore)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                    name = name.replace(".gz", "")
def run_uploadprep(args):
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    doc = from_url(args.out_base)
    file_map = {
        'broad': {},
        'muse': {},
        'broad_tar': {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    uuid_map = {}
    uuid_map['broad'] = synqueue.getValues(syn, "Broad_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['muse'] = synqueue.getValues(syn, "Muse_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['broad_tar'] = synqueue.getValues(syn, "Broad_TAR_UUID", orSet=lambda x: str(uuid.uuid4()), **config)

    #scan through all of the docs
    for id, entry in doc.filter():
        donor = None
        #look for docs with donor tags
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None and donor in wl_map:
            if donor not in job_map:
                job_map[donor] = {}
            #scan out the job metrics for this job
            if 'job' in entry and 'job_metrics' in entry['job']:
                job_id = entry['job']['id']
                tool_id = entry['job']['tool_id']
                job_info = {tool_id: {}}
                for met in entry['job']['job_metrics']:
                    job_info[tool_id][met['name']] = met['raw_value']
                job_map[donor][job_id] = job_info

            donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']
            #look for the vcf output files
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                #fill out the info depending on which caller created the file
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect']:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr, variant_type, call_type)
                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"

                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)
                    shutil.copy(src_file, dst_file)
                    #if the file wasn't compressed already, go ahead and do that
                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip -c %s > %s.gz" % (dst_file, dst_file), shell=True)
                        dst_file = dst_file + ".gz"
                    #add file to output map
                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)
            else:
                if entry['name'] == "broad.tar.gz":
                    target = Target(uuid=entry['uuid'])
                    src_file = doc.get_filename(target)
                    file_map['broad_tar'][donor] = [src_file]

    timing_map = {}
    for donor in job_map:
        timing_map[donor] = {}
        for job_id in job_map[donor]:
            for tool_id in job_map[donor][job_id]:
                if tool_id not in timing_map[donor]:
                    timing_map[donor][tool_id] = []
                timing_map[donor][tool_id].append(job_map[donor][job_id][tool_id])

    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    #go through every pipeline
    for pipeline, donors in file_map.items():
        #for that pipeline go through every donor
        for donor, files in donors.items():
            #we're only outputting data for donors on the work list
            if donor in wl_map and result_counts[donor] == 3:
                #output the timing json
                timing_json = os.path.abspath(os.path.join(args.workdir, "%s.%s.timing.json" % (pipeline, donor)))
                with open(timing_json, "w") as handle:
                    handle.write(json.dumps(timing_map[donor]))
                #output the uploader script
                with open(os.path.join(args.workdir, "%s.%s.sh" % (pipeline, donor)), "w") as handle:
                    input_file = os.path.basename(dst_file)
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']

                    if pipeline in ['broad', 'muse']:
                        prep_cmd_str = ""
                        for vcf in files:
                            prep_cmd_str += "tabix -p vcf %s\n" % (vcf)
                            prep_cmd_str += "mv %s.tbi %s.idx\n" % (vcf, vcf)
                            prep_cmd_str += "md5sum %s | awk '{print$1}' > %s.md5\n" % (vcf, vcf)
                            prep_cmd_str += "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" % (vcf, vcf)

                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])

                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --vcfs %s " % (",".join(files))
                        submit_cmd_str += " --vcf-md5sum-files %s " % (",".join("%s.md5" % i for i in files))
                        submit_cmd_str += " --vcf-idxs %s" % (",".join("%s.idx" % i for i in files))
                        submit_cmd_str += " --vcf-idx-md5sum-files %s" % (",".join("%s.idx.md5" % i for i in files))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --vm-location-code %s" % args.vm_location_code
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    if pipeline in ['broad_tar']:
                        prep_cmd_str = ""
                        new_files = []
                        for tar in files:
                            basename = donor_tumor + ".broad.intermediate"
                            prep_cmd_str = "%s/remap_broad_tar.py %s %s %s --rename %s %s" % (
                                os.path.dirname(os.path.abspath(__file__)),
                                tar, "./", basename,
                                donor, donor_tumor)
                            new_files.append(basename + ".tar")

                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])

                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --tarballs %s " % (",".join(new_files))
                        submit_cmd_str += " --tarball-md5sum-files %s " % (",".join("%s.md5" % i for i in new_files))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    handle.write(string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted

#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""").substitute(
                        PREP=prep_cmd_str,
                        SUBMIT=submit_cmd_str,
                        SUBMIT_DIR=os.path.join(
                            os.path.abspath(args.workdir), "vcf",
                            pipeline + "." + donor_tumor + ".dir",
                            uuid_map[pipeline][donor]),
                        KEY=args.keyfile))
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "reference_genome": "Homo_sapiens_assembly19.fasta",
        "cosmic": "b37_cosmic_v54_120711.vcf"
    }

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values())

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    if args.sample is not None:
        sync_doc_dir(
            os.path.join(os.path.dirname(__file__), "..", "testexomes"), docstore,
            filter=lambda x: x['donorId'] in args.sample
        )
    else:
        sync_doc_dir(os.path.join(os.path.dirname(__file__), "..", "testexomes"), docstore)

    tumor_uuids = {}
    normal_uuids = {}
    for id, ent in docstore.filter(sampleType="tumour"):
        tumor_uuids[ent['participant_id']] = id
    for id, ent in docstore.filter(sampleType="normal"):
        normal_uuids[ent['participant_id']] = id

    mc3_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_Test.ga")

    reference_id = None
    for a in docstore.filter(name="Homo_sapiens_assembly19.fasta"):
        reference_id = a[0]

    tasks = TaskGroup()
    for donor in tumor_uuids:
        if donor in normal_uuids:
            print "participant", donor
            donor_name = None
            for k, v in fake_metadata.items():
                if v['participant_id'] == donor:
                    donor_name = k
            workflow_dm = dict(dm)
            workflow_dm['tumor_bam'] = {"uuid": tumor_uuids[donor]}
            workflow_dm['normal_bam'] = {"uuid": normal_uuids[donor]}
            task = GalaxyWorkflowTask(
                "workflow_%s" % (donor),
                mc3_workflow,
                inputs=workflow_dm,
                parameters={
                    "reheader_config": {
                        "platform": "Illumina",
                        "center": "OHSU",
                        "reference_genome": "Homo_sapiens_assembly19.fasta",
                        "participant_uuid": fake_metadata[donor_name]['participant_id'],
                        "disease_code": fake_metadata[donor_name]['disease'],
                        "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                        "normal_analysis_uuid": fake_metadata[donor_name]['normal']['uuid'],
                        "normal_bam_name": fake_metadata[donor_name]['normal']['file_name'],
                        "normal_aliquot_uuid": fake_metadata[donor_name]['normal']['aliquot_id'],
                        "normal_aliquot_barcode": fake_metadata[donor_name]['normal']['barcode'],
                        "tumor_analysis_uuid": fake_metadata[donor_name]['tumour']['uuid'],
                        "tumor_bam_name": fake_metadata[donor_name]['tumour']['file_name'],
                        "tumor_aliquot_uuid": fake_metadata[donor_name]['tumour']['aliquot_id'],
                        "tumor_aliquot_barcode": fake_metadata[donor_name]['tumour']['barcode'],
                    }
                },
                tags=["donor:%s" % (donor)],
            )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy=args.galaxy,
            sudo=args.sudo,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["MuSE", 8],
                ["pindel", 8],
                ["mutect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 12],
                ["bwa_mem", 12],
                ["radia", 8],
                ["radia_filter", 8]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)