Exemple #1
0
def action_cp(args):
    rg = nebula.warpdrive.RemoteGalaxy(args.url, args.api_key)
    
    if not args.dir:
        docstore = from_url(args.dst)
    else:
        if not os.path.exists(args.dst):
            os.mkdir(args.dst)

    for hda in rg.get_history_contents(args.src):
        if hda['visible']:
            if args.filter is None or re.search(args.filter, hda['name']):
                if hda['name'] not in args.exclude:
                    print hda['name']
                    meta = rg.get_dataset(hda['id'], 'hda')
                    if args.dir:
                        dst_path = os.path.join(args.dst, hda['name'])
                        rg.download(meta['download_url'], dst_path)
                    else:
                        meta['id'] = meta['uuid'] #use the glocal id
                        hda = Target(uuid=meta['uuid'])
                        docstore.create(hda)
                        path = docstore.get_filename(hda)
                        rg.download(meta['download_url'], path)
                        docstore.update_from_file(hda)
Exemple #2
0
def run_audit(args):
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    for id, ent in docstore.filter(state="ok"):
        if ent['visible']:
            if docstore.size(Target(id)) > 0:
                donor = None
                for i in ent['tags']:
                    t = i.split(":")
                    if t[0] == "donor":
                        donor = t[1]
                if donor not in donor_map:
                    donor_map[donor] = {}
                donor_map[donor][ent['name']] = id

    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        if ent['meta']['Submitter_donor_ID'] in donor_map:
            print ent['meta']['Submitter_donor_ID'], len(
                donor_map[ent['meta']['Submitter_donor_ID']])
Exemple #3
0
def run_upload(args):
    # syn = synapseclient.Synapse()
    # syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}

    for id, doc in docstore.filter(visible=True, state="ok", name=["OUTPUT_BAM_1", "OUTPUT_BAM_2", "OUTPUT_BAM_3"]):
        print doc["name"], doc["tags"]
        for t in doc["tags"]:
            ts = t.split(":")
            if ts[0] == "donor":
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == "original_bam":
                bam_map[ts[1]] = id

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    for key, value in bam_map.items():
        t = Target(uuid=value)
        path = docstore.get_filename(t)
        print "%s\t%s" % (value, path)
        os.symlink(path, os.path.join(args.out, "MC3." + key + ".bam"))
Exemple #4
0
def action_cp(args):
    rg = nebula.warpdrive.RemoteGalaxy(args.url, args.api_key)

    if not args.dir:
        docstore = from_url(args.dst)
    else:
        if not os.path.exists(args.dst):
            os.mkdir(args.dst)

    for hda in rg.get_history_contents(args.src):
        if hda['visible']:
            if args.filter is None or re.search(args.filter, hda['name']):
                if hda['name'] not in args.exclude:
                    print hda['name']
                    meta = rg.get_dataset(hda['id'], 'hda')
                    if args.dir:
                        dst_path = os.path.join(args.dst, hda['name'])
                        rg.download(meta['download_url'], dst_path)
                    else:
                        meta['id'] = meta['uuid']  #use the glocal id
                        hda = Target(uuid=meta['uuid'])
                        docstore.create(hda)
                        path = docstore.get_filename(hda)
                        rg.download(meta['download_url'], path)
                        docstore.update_from_file(hda)
Exemple #5
0
def run_upload(args):
    #syn = synapseclient.Synapse()
    #syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}

    for id, doc in docstore.filter(
            visible=True,
            state='ok',
            name=['OUTPUT_BAM_1', 'OUTPUT_BAM_2', 'OUTPUT_BAM_3']):
        print doc['name'], doc['tags']
        for t in doc['tags']:
            ts = t.split(":")
            if ts[0] == 'donor':
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == 'original_bam':
                bam_map[ts[1]] = id

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    for key, value in bam_map.items():
        t = Target(uuid=value)
        path = docstore.get_filename(t)
        print "%s\t%s" % (value, path)
        os.symlink(path, os.path.join(args.out, "MC3." + key + ".bam"))
Exemple #6
0
def run_extract(args):
    docstore = from_url(args.out_base)

    for id, ent in docstore.filter(
        file_ext="vcf",
        name=[
            "muse.vcf",
            "pindel.vcf",
            "radia.dna-rna.vcf",
            "radia.dna.vcf",
            "somatic_sniper.vcf",
            "varscan.indel.vcf",
            "varscan.snp.vcf",
            "mutect.vcf",
        ],
    ):
        t = Target(uuid=ent["id"])
        if docstore.size(t) > 0:
            donor = None
            for e in ent["tags"]:
                tmp = e.split(":")
                if tmp[0] == "donor":
                    donor = tmp[1]
            if donor is not None:
                donor_dir = os.path.join(args.out_dir, donor)
                if not os.path.exists(donor_dir):
                    os.makedirs(donor_dir)
                print "Found", donor, ent["name"]
                shutil.copy(docstore.get_filename(t), os.path.join(donor_dir, ent["name"]))
Exemple #7
0
def run_errors(args):

    doc = from_url(args.out_base)

    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            if args.within is None or 'update_time' not in entry or check_within(entry['update_time'], args.within):
                print "Dataset", id, entry.get("job", {}).get("tool_id", ""), entry.get('update_time', ''), entry.get("tags", "")
                if args.full:
                    if 'provenance' in entry:
                        print "tool:", entry['provenance']['tool_id']
                        print "-=-=-=-=-=-=-"
                    print entry['job']['stdout']
                    print "-------------"
                    print entry['job']['stderr']
                    print "-=-=-=-=-=-=-"
Exemple #8
0
def run_errors(args):

    doc = from_url(args.out_base)

    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            if args.within is None or 'update_time' not in entry or check_within(entry['update_time'], args.within):
                print "Dataset", id, entry.get("job", {}).get("tool_id", ""), entry.get('update_time', ''), entry.get("tags", "")
                if args.full:
                    if 'provenance' in entry:
                        print "tool:", entry['provenance']['tool_id']
                        print "-=-=-=-=-=-=-"
                    print entry['job']['stdout']
                    print "-------------"
                    print entry['job']['stderr']
                    print "-=-=-=-=-=-=-"
Exemple #9
0
def run_errors(args):

    doc = from_url(args.out_base)

    for id, entry in doc.filter():
        if entry.get("state", "") == "error":
            if args.within is None or "update_time" not in entry or check_within(entry["update_time"], args.within):
                print "Dataset", id, entry.get("job", {}).get("tool_id", ""), entry.get("update_time", ""), entry.get(
                    "tags", ""
                )
                if args.full:
                    if "provenance" in entry:
                        print "tool:", entry["provenance"]["tool_id"]
                        print "-=-=-=-=-=-=-"
                    print entry["job"]["stdout"]
                    print "-------------"
                    print entry["job"]["stderr"]
                    print "-=-=-=-=-=-=-"
Exemple #10
0
def run_extract(args):
    docstore = from_url(args.out_base)
    
    for id, ent in docstore.filter(file_ext="vcf", name=[
        "muse.vcf", "pindel.vcf", "radia.dna-rna.vcf", "radia.dna.vcf", "somatic_sniper.vcf", 
        "varscan.indel.vcf", "varscan.snp.vcf", "mutect.vcf"
    ]):
        t = Target(uuid=ent['id'])
        if docstore.size(t) > 0:
            donor = None
            for e in ent['tags']:
                tmp = e.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
            if donor is not None:
                donor_dir = os.path.join(args.out_dir, donor)
                if not os.path.exists(donor_dir):
                    os.makedirs(donor_dir)
                print "Found", donor, ent['name']
                shutil.copy( docstore.get_filename(t), os.path.join(donor_dir, ent['name']) )
Exemple #11
0
def run_upload(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    bam_map = {}

    for id, doc in docstore.filter(visible=True, state='ok', name=['OUTPUT_BAM_1', 'OUTPUT_BAM_2', 'OUTPUT_BAM_3']):
        print doc['name'], doc['tags']
        for t in doc['tags']:
            ts = t.split(":")
            if ts[0] == 'donor':
                if ts[1] not in donor_map:
                    donor_map[ts[1]] = []
                donor_map[ts[1]].append(id)
            if ts[0] == 'original_bam':
                bam_map[ts[1]] = id

    print donor_map
    print bam_map
def run_audit(args):
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    for id, ent in docstore.filter( state="ok" ):
        if ent['visible']:
            if docstore.size(Target(id)) > 0:
                donor = None
                for i in ent['tags']:
                    t = i.split(":")
                    if t[0] == "donor":
                        donor = t[1]
                if donor not in donor_map:
                    donor_map[donor] = {}
                donor_map[donor][ent['name']] = id

    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        if ent['meta']['Submitter_donor_ID'] in donor_map:
            print ent['meta']['Submitter_donor_ID'], len(donor_map[ent['meta']['Submitter_donor_ID']])
Exemple #13
0
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "cosmic": "b37_cosmic_v54_120711.vcf",
    }

    ref_genomes = [
        "Homo_sapiens_assembly19.fasta",
        "GRCh37-lite.fa",
        "GRCh37-lite-+-HPV_Redux-build.fa",
        "GRCh37-lite_WUGSC_variant_1.fa.gz",
        "GRCh37-lite_WUGSC_variant_2.fa.gz",
        "hg19_M_rCRS.fa.gz",
    ]

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values() + ref_genomes)

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    mc3_dna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA.ga")
    mc3_dnarna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA_RNA.ga")

    rna_hit = None
    for a in docstore.filter(name="hg19_M_rCRS.fa"):
        rna_hit = a[0]

    tasks = TaskGroup()
    assembly_hits = {}
    with open(args.joblist) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            if row["normal_assembly"] != row["tumor_assembly"]:
                print "Row Mispatch", row["normal_assembly"], row["tumor_assembly"]
                # raise Exception("Mismatch reference")
            ref_name = row["normal_assembly"]
            if ref_name in ref_rename:
                ref_name = ref_rename[ref_name]
            if ref_name in assembly_hits:
                hit = assembly_hits[ref_name]
            else:
                hit = None
                for a in docstore.filter(name=ref_name + ".fasta"):
                    hit = a[0]
                for a in docstore.filter(name=ref_name + ".fa"):
                    hit = a[0]
                if hit is None:
                    raise Exception("%s not found" % (ref_name))
                assembly_hits[ref_name] = hit
            workflow_dm = dict(dm)
            workflow_dm["reference_genome"] = {"uuid": hit}

            params = {
                "tumor_bam": {
                    "uuid": row["tumor_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                },
                "normal_bam": {
                    "uuid": row["normal_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                },
                "reheader_config": {
                    "platform": "Illumina",
                    "center": "OHSU",
                    "reference_genome": ref_name,
                    "participant_uuid": row["participant_id"],
                    "disease_code": row["disease"],
                    "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid": row["normal_analysis_id"],
                    "normal_bam_name": row["normal_filename"],
                    "normal_aliquot_uuid": row["normal_aliquot_id"],
                    "normal_aliquot_barcode": row["normal_barcode"],
                    "tumor_analysis_uuid": row["tumor_analysis_id"],
                    "tumor_bam_name": row["tumor_filename"],
                    "tumor_aliquot_uuid": row["tumor_aliquot_id"],
                    "tumor_aliquot_barcode": row["tumor_barcode"],
                },
            }

            if row["rna_analysis_id"] != "NA":
                params["rna_tumor_bam"] = {
                    "uuid": row["rna_analysis_id"],
                    "gnos_endpoint": "cghub.ucsc.edu",
                    "cred_file": "/tool_data/files/cghub.key",
                }
                workflow_dm["rna_reference_genome"] = {"uuid": rna_hit}
                task = GalaxyWorkflowTask(
                    "workflow_%s" % (row["job_id"]),
                    mc3_dnarna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=["donor:%s" % (row["participant_id"])],
                )
            else:
                task = GalaxyWorkflowTask(
                    "workflow_%s" % (row["job_id"]),
                    mc3_dna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=["donor:%s" % (row["participant_id"])],
                )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    service = GalaxyService(
        docstore=docstore,
        galaxy=args.galaxy,
        sudo=args.sudo,
        tool_data=args.tool_data,
        tool_dir=args.tool_dir,
        work_dir=args.work_dir,
        smp=[
            ["gatk_bqsr", 12],
            ["gatk_indel", 24],
            ["MuSE", 8],
            ["pindel", 8],
            ["mutect", 8],
            ["delly", 4],
            ["gatk_bqsr", 12],
            ["gatk_indel", 12],
            ["bwa_mem", 12],
            ["radia", 8],
            ["radia_filter", 8],
        ],
    )
    with open("%s.service" % (args.out_base), "w") as handle:
        s = service.get_config()
        if args.scratch:
            print "Using scratch", args.scratch
            s.set_docstore_config(cache_path=args.scratch, open_perms=True)
        s.store(handle)
Exemple #14
0
def run_uploadprep(args):
    
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    file_map = {
        'broad' : {},
        'muse' : {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']
    
    for id, entry in doc.filter():
        donor = None    
        if 'tags' in entry:
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None: 
            if donor not in job_map:
                job_map[donor] = {}
            if 'job' in entry and 'job_metrics' in entry['job']:
                print entry['name']
                for met in entry['job']['job_metrics']:
                    if met['name'] == 'runtime_seconds':
                        job_map[donor][entry['name']] = {"tool_id" : entry['job']['tool_id'], "runtime_seconds" : met['raw_value']}
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect' ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor, method, datestr, variant_type, call_type )

                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)

                    shutil.copy(src_file, dst_file)
                    if entry['extension'] == 'vcf':
                        subprocess.check_call( "bgzip -c %s > %s.gz" % (dst_file, dst_file), shell=True )
                        dst_file = dst_file + ".gz"

                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)

                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []

                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)

    for pipeline, donors in file_map.items():
        for donor, files in donors.items():
            if donor in wl_map:
                """
                with open( os.path.join(args.workdir, "%s.%s.pipeline.json" %(pipeline, donor)), "w" ) as handle:
                    handle.write(json.dumps( {"pipeline_src" : args.pipeline_src, "pipeline_version" : args.pipeline_version} ))
                """
                timing_json = os.path.join(args.workdir, "%s.%s.timing.json" %(pipeline, donor))
                with open( timing_json, "w" ) as handle:
                    handle.write(json.dumps( job_map[donor] ) )
                    
            
                with open( os.path.join(args.workdir, "%s.%s.sh" %(pipeline, donor)), "w" ) as handle:
                    input_file = os.path.basename(dst_file)
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                    cmd_str += " --metadata-urls %s" % (",".join(urls))
                    cmd_str += " --vcfs %s " % (",".join(files))
                    cmd_str += " --vcf-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in files) )))
                    cmd_str += " --vcf-idxs %s" % ((",".join( ("%s.idx" % i for i in files) )))
                    cmd_str += " --vcf-idx-md5sum-files %s" % ((",".join( ("%s.idx.md5" % i for i in files) )))
                    cmd_str += " --outdir %s.%s.dir" % (pipeline, donor)
                    cmd_str += " --key %s " % (args.keyfile)
                    cmd_str += " --upload-url %s" % (args.upload_url)
                    cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                    cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                    cmd_str += " --timing-metrics-json %s" % (timing_json)
                    handle.write("#!/bin/bash\n%s\n" % (cmd_str) )

# Script entry point: parse the command line, log in to Synapse, open the
# docstore and optionally pull the reference files.
# NOTE(review): this fragment looks truncated -- after the gunzip step nothing
# registers ``name`` or updates the docstore, and entries without a
# ``dataPrep`` annotation are never copied. Confirm against the full script.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out-base", default="pcawg")
    parser.add_argument("--ref-download", action="store_true", default=False)
    parser.add_argument("--create-service", action="store_true", default=False)
    parser.add_argument("--pilot", action="store_true", default=False)
    parser.add_argument("--scratch", default=None)

    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    if args.ref_download:
        # Download reference files from Synapse and populate the document store.
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])

            # The docstore target is keyed by the entity's ``uuid`` annotation.
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Decompress the fetched file straight into the docstore path
                    # and drop ".gz" from the recorded name.
                    subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                    name = name.replace(".gz", "")
def run_gen(args):
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome" : "genome.fa",
        "dbsnp" : "dbsnp_132_b37.leftAligned.vcf",
        "cosmic" : "b37_cosmic_v54_120711.vcf",
        "gold_indels" : "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels" : "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere" : "centromere_hg19.bed"
    }

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            print "found",  a['entity.name']
            if a['entity.name'] in data_mapping.values() or a['entity.name'].replace(".gz", "") in data_mapping.values():
                print "loading"
                ent = syn.get(a['entity.id'])
                id = ent.annotations['uuid'][0]
                t = Target(uuid=id)
                docstore.create(t)
                path = docstore.get_filename(t)
                name = ent.name
                if 'dataPrep' in ent.annotations:
                    if ent.annotations['dataPrep'][0] == 'gunzip':
                        subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                        name = name.replace(".gz", "")
                    else:
                        print "Unknown DataPrep"
                else:
                    shutil.copy(ent.path, path)
                docstore.update_from_file(t)
                meta = {}
                meta['name'] = name
                meta['uuid'] = id
                if 'dataPrep' in meta:
                    del meta['dataPrep']
                docstore.put(id, meta)

    dm = {}
    for k,v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = { "uuid" : hit }

    workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")
    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        #print "'%s'" % (ent['state']), ent['state'] == 'nan', type(ent['state']), type('nan')
        if not isinstance(ent['state'], basestring) and isnan(ent['state']):
            gnos_endpoint = urlparse(ent['meta']['Normal_WGS_alignment_GNOS_repos']).netloc
            task = GalaxyWorkflowTask("workflow_%s" % (ent['id']),
                workflow,
                inputs=dm,
                parameters={
                    'normal_bam_download' : {
                        "uuid" : ent['meta']['Normal_WGS_alignment_GNOS_analysis_ID'],
                        "gnos_endpoint" : gnos_endpoint,
                        "cred_file" : key_map[gnos_endpoint]
                    },
                    'tumor_bam_download' : {
                        "uuid" : ent['meta']['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                        "gnos_endpoint" : gnos_endpoint,
                        "cred_file" : key_map[gnos_endpoint]
                    },
                    'broad_variant_pipeline' : {
                        "broad_ref_dir" : "/tool_data/files/refdata",
                        "sample_id" : ent['meta']['Submitter_donor_ID']
                    }
                },
                tags=[ "donor:%s" % (ent['meta']['Submitter_donor_ID']) ]
            )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))

    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))
            state_file = "%s.tasks/%s.state" % (args.out_base, data.task_id)
            if os.path.exists( state_file ):
                os.unlink( state_file )

    print "Tasks Created: %s" % (len(tasks))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=args.sudo,
            tool_data=os.path.abspath("tool_data"),
            tool_dir=os.path.abspath("tools"),
            work_dir=args.work_dir,
            smp=[
                ["MuSE", 8],
                ["pindel", 8],
                ["muTect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["bwa_mem", 12],
                ["broad_variant_pipline", 24]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
Exemple #17
0
def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp" : "dbsnp_132_b37.leftAligned.vcf",
        "centromere" : "centromere_hg19.bed",
        "cosmic" : "b37_cosmic_v54_120711.vcf"
    }

    ref_genomes = [
        "Homo_sapiens_assembly19.fasta",
        "GRCh37-lite.fa",
        "GRCh37-lite-+-HPV_Redux-build.fa",
        "GRCh37-lite_WUGSC_variant_1.fa.gz",
        "GRCh37-lite_WUGSC_variant_2.fa.gz",
        "hg19_M_rCRS.fa.gz"
    ]

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values() + ref_genomes)

    dm = {}
    for k,v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = { "uuid" : hit }

    mc3_dna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA.ga")
    mc3_dnarna_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_CGHub_DNA_RNA.ga")

    rna_hit = None
    for a in docstore.filter(name="hg19_M_rCRS.fa"):
        rna_hit = a[0]

    tasks = TaskGroup()
    assembly_hits = {}
    with open(args.joblist) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            if row['normal_assembly'] != row['tumor_assembly']:
                print "Row Mispatch", row['normal_assembly'], row['tumor_assembly']
                #raise Exception("Mismatch reference")
            ref_name = row['normal_assembly']
            if ref_name in ref_rename:
                ref_name = ref_rename[ref_name]
            if ref_name in assembly_hits:
                hit = assembly_hits[ref_name]
            else:
                hit = None
                for a in docstore.filter(name=ref_name + ".fasta"):
                    hit = a[0]
                for a in docstore.filter(name=ref_name + ".fa"):
                    hit = a[0]
                if hit is None:
                    raise Exception("%s not found" % (ref_name))
                assembly_hits[ref_name] = hit
            workflow_dm = dict(dm)
            workflow_dm['reference_genome'] = { "uuid" : hit }
            
            params = {
                'tumor_bam' : {
                    "uuid" : row['tumor_analysis_id'],
                    "gnos_endpoint" : "cghub.ucsc.edu",
                    "cred_file" : "/tool_data/files/cghub.key"
                },
                'normal_bam' : {
                    "uuid" : row['normal_analysis_id'],
                    "gnos_endpoint" : "cghub.ucsc.edu",
                    "cred_file" : "/tool_data/files/cghub.key"
                },
                "reheader_config" : {
                    "platform" : "Illumina",
                    "center" : "OHSU",
                    "reference_genome" : ref_name,
                    "participant_uuid" : row['participant_id'],
                    "disease_code" : row['disease'],
                    "filedate" : datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid" : row['normal_analysis_id'],
                    "normal_bam_name" : row['normal_filename'],
                    "normal_aliquot_uuid" : row['normal_aliquot_id'],
                    "normal_aliquot_barcode": row['normal_barcode'],
                    "tumor_analysis_uuid" : row['tumor_analysis_id'],
                    "tumor_bam_name" : row['tumor_filename'],
                    "tumor_aliquot_uuid" : row['tumor_aliquot_id'],
                    "tumor_aliquot_barcode" : row['tumor_barcode'],
                }
            }
            
            if row['rna_analysis_id'] != "NA":
                params['rna_tumor_bam'] = {
                    "uuid" : row['rna_analysis_id'],
                    "gnos_endpoint" : "cghub.ucsc.edu",
                    "cred_file" : "/tool_data/files/cghub.key"
                }
                workflow_dm['rna_reference_genome'] = { "uuid" : rna_hit }
                task = GalaxyWorkflowTask("workflow_%s" % (row['job_id']),
                    mc3_dnarna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=[ "donor:%s" % (row['participant_id']) ],
                )            
            else: 
                task = GalaxyWorkflowTask("workflow_%s" % (row['job_id']),
                    mc3_dna_workflow,
                    inputs=workflow_dm,
                    parameters=params,
                    tags=[ "donor:%s" % (row['participant_id']) ],
                )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    service = GalaxyService(
        docstore=docstore,
        galaxy=args.galaxy,
        sudo=args.sudo,
        tool_data=args.tool_data,
        tool_dir=args.tool_dir,
        work_dir=args.work_dir,
        smp=[
            ["gatk_bqsr", 12],
            ["gatk_indel", 24],
            ["MuSE", 8],
            ["pindel", 8],
            ["mutect", 8],
            ["delly", 4],
            ["gatk_bqsr", 12],
            ["gatk_indel", 12],
            ["bwa_mem", 12],
            ["radia", 8],
            ['radia_filter', 8]
        ]
    )
    with open("%s.service" % (args.out_base), "w") as handle:
        s = service.get_config()
        if args.scratch:
            print "Using scratch", args.scratch
            s.set_docstore_config(cache_path=args.scratch, open_perms=True)
        s.store(handle)
Exemple #18
0
def run_gen(args):
    """Generate GATK CGHub workflow task files and, optionally, a service config.

    Steps, in order:
      1. Log in to Synapse and open the document store at ``args.out_base``.
      2. If ``args.ref_download`` is set, copy every entity returned by the
         ``REFDATA_PROJECT`` query into the document store (entities
         annotated with ``dataPrep == 'gunzip'`` are decompressed on the way).
      3. Resolve the fixed reference VCFs and, per assignment, the reference
         genome to document-store ids.
      4. Build one ``GalaxyWorkflowTask`` for every assignment that lists two
         or three BAM ids and write each task as JSON to
         ``<out_base>.tasks/<task_id>``. Assignments with any other number of
         BAM ids are skipped.
      5. If ``args.create_service`` is set, write the Galaxy service config to
         ``<out_base>.service``.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``alt_table``, ``out_base``, ``ref_download``, ``create_service``,
            ``tool_data``, ``tool_dir``, ``work_dir`` and ``scratch``.

    Raises:
        Exception: if a required reference file is not in the document store.
        AssertionError: if an assignment does not name exactly one reference
            assembly.
        subprocess.CalledProcessError: if ``gunzip`` exits non-zero.
    """
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        # Mutates the module-level ``config`` passed to synqueue below.
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        # Download reference files from Synapse and populate the document store.
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for row in syn.chunkedQuery(query):
            ent = syn.get(row['entity.id'])

            ref_uuid = ent.annotations['uuid'][0]
            target = Target(uuid=ref_uuid)
            docstore.create(target)
            path = docstore.get_filename(target)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Send gunzip's stdout straight into the destination file.
                    # No shell is involved, so paths containing spaces or
                    # shell metacharacters are passed through verbatim.
                    with open(path, "wb") as out_handle:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=out_handle)
                    name = name.replace(".gz", "")
                else:
                    # NOTE(review): nothing is written to ``path`` in this
                    # case, yet update_from_file() is still called below.
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(target)
            docstore.put(ref_uuid, {'name': name, 'uuid': ref_uuid})

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    # Map each workflow input name to the document holding its reference file.
    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = None
        for found in docstore.filter(name=file_name):
            hit = found[0]  # when several documents match, the last one wins
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    # One workflow definition per supported number of input BAMs.
    workflows = {
        2: GalaxyWorkflow(
            ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga"),
        3: GalaxyWorkflow(
            ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga"),
    }

    ref_rename = {"HG19_Broad_variant": "Homo_sapiens_assembly19"}

    tasks = TaskGroup()

    for ent in synqueue.listAssignments(syn, **config):
        # NOTE(review): the BAM order follows dict iteration order of the
        # assignment metadata, so which id lands in INPUT_BAM_1 is not pinned.
        bam_set = list(
            item[1] for item in ent['meta'].items()
            if item[0].startswith("id_") and isinstance(item[1], basestring))

        ref_set = set(item[1] for item in ent['meta'].items()
                      if item[0].startswith("ref_assembly_")
                      and isinstance(item[1], basestring))
        assert (len(ref_set) == 1)
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]

        # Accept either extension; a ".fa" match overrides a ".fasta" match.
        hit = None
        for found in docstore.filter(name=ref_name + ".fasta"):
            hit = found[0]
        for found in docstore.filter(name=ref_name + ".fa"):
            hit = found[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}

        workflow = workflows.get(len(bam_set))
        if workflow is None:
            # Only two- and three-BAM assignments have a workflow.
            continue

        # INPUT_BAM_<n> / BQSR_<n> are numbered from 1 in bam_set order.
        parameters = {}
        tool_tags = {}
        for index, bam_id in enumerate(bam_set, 1):
            parameters['INPUT_BAM_%d' % index] = {
                "uuid": bam_id,
                "gnos_endpoint": "cghub.ucsc.edu",
                "cred_file": "/tool_data/files/cghub.key"
            }
            tool_tags["BQSR_%d" % index] = {
                "output_bam": ["original_bam:%s" % (bam_id)]
            }

        tasks.append(GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=workflow_dm,
            parameters=parameters,
            tags=["donor:%s" % (ent['meta']['participant_id'])],
            tool_tags=tool_tags))

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)
    for task in tasks:
        with open("%s/%s" % (task_dir, task.task_id), "w") as handle:
            handle.write(json.dumps(task.to_dict()))

    if args.create_service:

        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=True,
                                tool_data=args.tool_data,
                                tool_dir=args.tool_dir,
                                work_dir=args.work_dir,
                                smp=[["gatk_bqsr", 12], ["gatk_indel", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
Exemple #19
0
# Identifier substituted as the parentId in the reference-data entity query
# below.
REFDATA_PROJECT="syn3241088"


# Script entry point: parse the command line, log in to Synapse, open the
# document store and, when requested, load the reference files into it.
# NOTE(review): this snippet appears to be cut off after the gunzip branch;
# --create-service is parsed but never used in the visible part.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("docstore", help="DocStore")
    parser.add_argument("--ref-download", action="store_true", default=False)
    parser.add_argument("--create-service", action="store_true", default=False)

    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    # The positional "docstore" argument is handed to from_url() unchanged.
    docstore = from_url(args.docstore)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])

            # The first value of the entity's 'uuid' annotation becomes the
            # document-store target id.
            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Decompress into the document-store file via the shell.
                    # NOTE(review): the paths are interpolated unquoted, so
                    # spaces or shell metacharacters would break this command.
                    subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                    # Drop the ".gz" so the stored name matches the content.
                    name = name.replace(".gz", "")
Exemple #20
0
def run_gen(args):
    """Generate GATK CGHub workflow task files and, optionally, a service config.

    Steps, in order:
      1. Log in to Synapse and open the document store at ``args.out_base``.
      2. If ``args.ref_download`` is set, copy every entity returned by the
         ``REFDATA_PROJECT`` query into the document store (entities
         annotated with ``dataPrep == 'gunzip'`` are decompressed on the way).
      3. Resolve the fixed reference VCFs and, per assignment, the reference
         genome to document-store ids.
      4. Build one ``GalaxyWorkflowTask`` for every assignment that lists two
         or three BAM ids and write each task as JSON to
         ``<out_base>.tasks/<task_id>``. Assignments with any other number of
         BAM ids are skipped.
      5. If ``args.create_service`` is set, write the Galaxy service config to
         ``<out_base>.service``.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``alt_table``, ``out_base``, ``ref_download``, ``create_service``,
            ``tool_data``, ``tool_dir``, ``work_dir`` and ``scratch``.

    Raises:
        Exception: if a required reference file is not in the document store.
        AssertionError: if an assignment does not name exactly one reference
            assembly.
        subprocess.CalledProcessError: if ``gunzip`` exits non-zero.
    """
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        # Mutates the module-level ``config`` passed to synqueue below.
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for row in syn.chunkedQuery(query):
            ent = syn.get(row['entity.id'])

            ref_uuid = ent.annotations['uuid'][0]
            target = Target(uuid=ref_uuid)
            docstore.create(target)
            path = docstore.get_filename(target)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Send gunzip's stdout straight into the destination file.
                    # No shell is involved, so paths containing spaces or
                    # shell metacharacters are passed through verbatim.
                    with open(path, "wb") as out_handle:
                        subprocess.check_call(["gunzip", "-c", ent.path], stdout=out_handle)
                    name = name.replace(".gz", "")
                else:
                    # NOTE(review): nothing is written to ``path`` in this
                    # case, yet update_from_file() is still called below.
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(target)
            docstore.put(ref_uuid, { 'name' : name, 'uuid' : ref_uuid })

    data_mapping = {
        "dbsnp" : "dbsnp_132_b37.leftAligned.vcf",
        "cosmic" : "b37_cosmic_v54_120711.vcf",
        "gold_indels" : "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels" : "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    # Map each workflow input name to the document holding its reference file.
    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = None
        for found in docstore.filter(name=file_name):
            hit = found[0]  # when several documents match, the last one wins
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = { "uuid" : hit }

    # One workflow definition per supported number of input BAMs.
    workflows = {
        2 : GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga"),
        3 : GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga"),
    }

    ref_rename = {
        "HG19_Broad_variant" : "Homo_sapiens_assembly19"
    }

    tasks = TaskGroup()

    for ent in synqueue.listAssignments(syn, **config):
        # NOTE(review): the BAM order follows dict iteration order of the
        # assignment metadata, so which id lands in INPUT_BAM_1 is not pinned.
        bam_set = list( item[1] for item in ent['meta'].items() if item[0].startswith("id_") and isinstance(item[1], basestring) )

        ref_set = set( item[1] for item in ent['meta'].items() if item[0].startswith("ref_assembly_") and isinstance(item[1], basestring) )
        assert(len(ref_set) == 1)
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]

        # Accept either extension; a ".fa" match overrides a ".fasta" match.
        hit = None
        for found in docstore.filter(name=ref_name + ".fasta"):
            hit = found[0]
        for found in docstore.filter(name=ref_name + ".fa"):
            hit = found[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = { "uuid" : hit }

        workflow = workflows.get(len(bam_set))
        if workflow is None:
            # Only two- and three-BAM assignments have a workflow.
            continue

        # INPUT_BAM_<n> / BQSR_<n> are numbered from 1 in bam_set order.
        parameters = {}
        tool_tags = {}
        for index, bam_id in enumerate(bam_set, 1):
            parameters['INPUT_BAM_%d' % index] = {
                "uuid" : bam_id,
                "gnos_endpoint" : "cghub.ucsc.edu",
                "cred_file" : "/tool_data/files/cghub.key"
            }
            tool_tags["BQSR_%d" % index] = {
                "output_bam" : [ "original_bam:%s" % (bam_id) ]
            }

        task = GalaxyWorkflowTask("workflow_%s" % (ent['id']),
            workflow,
            inputs=workflow_dm,
            parameters=parameters,
            tags=[ "donor:%s" % (ent['meta']['participant_id']) ],
            tool_tags=tool_tags
        )
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)
    for task in tasks:
        with open("%s/%s" % (task_dir, task.task_id), "w") as handle:
            handle.write(json.dumps(task.to_dict()))

    if args.create_service:

        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=True,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[
                ["gatk_bqsr", 12],
                ["gatk_indel", 24]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
def run_uploadprep(args):
    """Stage pipeline results in ``args.workdir`` and write upload scripts.

    Every document in the store at ``args.out_base`` is scanned. For a
    document in state ``ok`` carrying a ``donor:<id>`` tag whose id is on the
    Synapse work list, the function:

      * records the job metrics attached to the document, keyed by tool id;
      * copies visible VCF outputs of the MuSE and Broad callers into the
        work directory under a normalised name (plain ``vcf`` files are
        additionally bgzip-compressed);
      * remembers the location of a ``broad.tar.gz`` document.

    For each donor that has results in all three pipelines (``broad``,
    ``muse``, ``broad_tar``) it then writes, per pipeline,
    ``<pipeline>.<donor>.timing.json`` and a ``<pipeline>.<donor>.sh`` script
    holding the preparation commands and the ``gnos_upload_vcf.pl`` call.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``workdir``, ``out_base``, ``keyfile``, ``upload_url``, ``study``,
            ``pipeline_src``, ``pipeline_name``, ``pipeline_version``,
            ``vm_instance_type``, ``vm_instance_cores``,
            ``vm_instance_mem_gb`` and ``vm_location_code``.

    Raises:
        Exception: if a VCF document comes from an unrecognised pipeline or
            its name reveals no known variant type / call type.
        subprocess.CalledProcessError: if ``bgzip`` exits non-zero.
    """

    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    # pipeline name -> donor id -> list of files to upload
    file_map = {
        'broad' : {},
        'muse' : {},
        'broad_tar' : {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    # Work list: assignment id -> assignment metadata. Donor tags found on
    # documents are matched against these ids.
    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    # Upload UUIDs per pipeline; the orSet callback supplies a fresh UUID.
    uuid_map = {}
    uuid_map['broad'] = synqueue.getValues(syn, "Broad_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['muse']  = synqueue.getValues(syn, "Muse_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['broad_tar'] = synqueue.getValues(syn, "Broad_TAR_UUID", orSet=lambda x: str(uuid.uuid4()), **config)

    #scan through all of the docs
    for _, entry in doc.filter():
        donor = None
        #look for docs with donor tags
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None and donor in wl_map:
            if donor not in job_map:
                job_map[donor] = {}
            #scan out the job metrics for this job
            if 'job' in entry and 'job_metrics' in entry['job']:
                job_id = entry['job']['id']
                tool_id = entry['job']['tool_id']
                job_info = { tool_id : {} }
                for met in entry['job']['job_metrics']:
                    job_info[tool_id][met['name']] = met['raw_value']
                job_map[donor][job_id] = job_info
            donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']
            #look for the vcf output files
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                #fill out the info depending on which caller created the file
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect' ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr, variant_type, call_type )

                # Strip a literal ".vcf" suffix only; the dot is escaped so
                # that it cannot match an arbitrary character.
                name = re.sub(r'\.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)

                    shutil.copy(src_file, dst_file)
                    #if the files wasn't compressed already, go ahead and do that
                    if entry['extension'] == 'vcf':
                        # Run bgzip without a shell and send its stdout to the
                        # ".gz" file, so the file name is never interpreted
                        # by a shell.
                        with open(dst_file + ".gz", "wb") as gz_handle:
                            subprocess.check_call( ["bgzip", "-c", dst_file], stdout=gz_handle )
                        dst_file = dst_file + ".gz"

                    #add file to output map
                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)
            else:
                if entry['name'] == "broad.tar.gz":
                    target = Target(uuid=entry['uuid'])
                    src_file = doc.get_filename(target)
                    file_map['broad_tar'][donor] = [ src_file ]


    # Regroup the job metrics: donor -> tool id -> list of metric dicts.
    timing_map = {}
    for donor in job_map:
        timing_map[donor] = {}
        for job_id in job_map[donor]:
            for tool_id in job_map[donor][job_id]:
                if tool_id not in timing_map[donor]:
                    timing_map[donor][tool_id] = []
                timing_map[donor][tool_id].append( job_map[donor][job_id][tool_id] )

    # Number of pipelines in which each donor has at least one result.
    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    #go through every pipeline
    for pipeline, donors in file_map.items():
        #for that pipeline go through every donor
        for donor, files in donors.items():
            #we're only outputing data for donors on the work list
            # ...and only once all three pipelines have produced results.
            if donor in wl_map and result_counts[donor] == 3:
                #output the timing json
                timing_json = os.path.abspath(os.path.join(args.workdir, "%s.%s.timing.json" %(pipeline, donor)))
                with open( timing_json, "w" ) as handle:
                    handle.write(json.dumps( timing_map[donor] ) )

                #output the uploader script
                with open( os.path.join(args.workdir, "%s.%s.sh" %(pipeline, donor)), "w" ) as handle:
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']

                    # UUIDs of this donor's uploads in the other pipelines.
                    related_uuids = []
                    for p in uuid_map:
                        if p != pipeline:
                            related_uuids.append(uuid_map[p][donor])

                    if pipeline in ['broad', 'muse']:
                        # Index every VCF with tabix, rename the index to
                        # ".idx" and checksum both files.
                        prep_cmd_str = ""
                        for vcf in files:
                            prep_cmd_str += "tabix -p vcf %s\n" % (vcf)
                            prep_cmd_str += "mv %s.tbi %s.idx\n" % (vcf,vcf)
                            prep_cmd_str += "md5sum %s | awk '{print$1}' > %s.md5\n" % (vcf, vcf)
                            prep_cmd_str += "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" % (vcf, vcf)

                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --vcfs %s " % (",".join(files))
                        submit_cmd_str += " --vcf-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in files) )))
                        submit_cmd_str += " --vcf-idxs %s" % ((",".join( ("%s.idx" % i for i in files) )))
                        submit_cmd_str += " --vcf-idx-md5sum-files %s" % ((",".join( ("%s.idx.md5" % i for i in files) )))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --vm-location-code %s" % args.vm_location_code
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    elif pipeline in ['broad_tar']:
                        prep_cmd_str = ""
                        new_files = []
                        # NOTE(review): prep_cmd_str is overwritten, not
                        # appended, per tarball; the scan above stores exactly
                        # one tarball per donor, so only one command is needed.
                        for tar in files:
                            basename = donor_tumor + ".broad.intermediate"
                            prep_cmd_str = "%s/remap_broad_tar.py %s %s %s --rename %s %s" % (
                                os.path.dirname(os.path.abspath(__file__)),
                                tar,
                                "./",
                                basename,
                                donor, donor_tumor
                            )
                            new_files.append( basename + ".tar" )

                        # NOTE(review): unlike the VCF command above, this one
                        # passes no --vm-location-code; confirm that is meant.
                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --tarballs %s " % (",".join(new_files))
                        submit_cmd_str += " --tarball-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in new_files) )))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    # "$$" is string.Template's escape for a literal "$" in
                    # the generated shell script.
                    handle.write(string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted
#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""").substitute(PREP=prep_cmd_str, SUBMIT=submit_cmd_str,
                            SUBMIT_DIR=os.path.join(os.path.abspath(args.workdir), "vcf", pipeline + "." + donor_tumor + ".dir", uuid_map[pipeline][donor] ),
                            KEY=args.keyfile
                    ) )
Exemple #22
0
def run_gen(args):
    """Generate MC3 test-pipeline task files and, optionally, a service config.

    Steps, in order:
      1. Log in to Synapse and open the document store at ``args.out_base``.
      2. If ``args.ref_download`` is set, call ``syn_sync`` for the reference
         files named in ``data_mapping``.
      3. Resolve every reference file to a document-store id.
      4. Call ``sync_doc_dir`` on the ``testexomes`` directory next to this
         module's parent directory, restricted to ``args.sample`` when given.
      5. Pair tumour and normal documents by ``participant_id`` and build one
         ``GalaxyWorkflowTask`` per participant that has both, written as
         JSON to ``<out_base>.tasks/<task_id>``.
      6. If ``args.create_service`` is set, write the Galaxy service config to
         ``<out_base>.service``.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``out_base``, ``ref_download``, ``sample``, ``create_service``,
            ``galaxy``, ``sudo``, ``tool_data``, ``tool_dir``, ``work_dir``
            and ``scratch``.

    Raises:
        Exception: if a reference file is not in the document store.
        KeyError: if a paired participant has no ``fake_metadata`` entry.
    """
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp" : "dbsnp_132_b37.leftAligned.vcf",
        "centromere" : "centromere_hg19.bed",
        "reference_genome" : "Homo_sapiens_assembly19.fasta",
        "cosmic" : "b37_cosmic_v54_120711.vcf"
    }

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values())

    # Map each workflow input name to the document holding its reference file.
    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = None
        for found in docstore.filter(name=file_name):
            hit = found[0]  # when several documents match, the last one wins
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = { "uuid" : hit }

    exome_dir = os.path.join( os.path.dirname(__file__), "..", "testexomes" )
    if args.sample is not None:
        sync_doc_dir(
            exome_dir, docstore,
            filter=lambda x: x['donorId'] in args.sample
        )
    else:
        sync_doc_dir(exome_dir, docstore)

    # participant id -> document id, one mapping per sample type
    tumor_uuids = {}
    normal_uuids = {}

    for doc_id, ent in docstore.filter(sampleType="tumour"):
        tumor_uuids[ent['participant_id']] = doc_id

    for doc_id, ent in docstore.filter(sampleType="normal"):
        normal_uuids[ent['participant_id']] = doc_id

    mc3_workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_Test.ga")

    tasks = TaskGroup()
    for donor in tumor_uuids:
        # Only participants with both a tumour and a normal sample get a task.
        if donor not in normal_uuids:
            continue
        print("participant %s" % (donor,))

        # fake_metadata is keyed by donor name; find the entry for this
        # participant id (the last match wins).
        donor_name = None
        for candidate, record in fake_metadata.items():
            if record['participant_id'] == donor:
                donor_name = candidate
        if donor_name is None:
            raise KeyError("no fake_metadata entry for participant %s" % (donor,))
        donor_meta = fake_metadata[donor_name]

        workflow_dm = dict(dm)
        workflow_dm['tumor_bam'] = { "uuid" : tumor_uuids[donor] }
        workflow_dm['normal_bam'] = { "uuid" : normal_uuids[donor] }

        task = GalaxyWorkflowTask("workflow_%s" % (donor),
            mc3_workflow,
            inputs=workflow_dm,
            parameters={
                "reheader_config" : {
                    "platform" : "Illumina",
                    "center" : "OHSU",
                    "reference_genome" : "Homo_sapiens_assembly19.fasta",
                    "participant_uuid" : donor_meta['participant_id'],
                    "disease_code" : donor_meta['disease'],
                    "filedate" : datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid" : donor_meta['normal']['uuid'],
                    "normal_bam_name" : donor_meta['normal']['file_name'],
                    "normal_aliquot_uuid" : donor_meta['normal']['aliquot_id'],
                    "normal_aliquot_barcode": donor_meta['normal']['barcode'],
                    "tumor_analysis_uuid" : donor_meta['tumour']['uuid'],
                    "tumor_bam_name" : donor_meta['tumour']['file_name'],
                    "tumor_aliquot_uuid" : donor_meta['tumour']['aliquot_id'],
                    "tumor_aliquot_barcode" : donor_meta['tumour']['barcode'],
                }
            },
            tags=[ "donor:%s" % (donor) ],
        )
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)
    for task in tasks:
        with open("%s/%s" % (task_dir, task.task_id), "w") as handle:
            handle.write(json.dumps(task.to_dict()))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy=args.galaxy,
            sudo=args.sudo,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            # NOTE(review): "gatk_bqsr" and "gatk_indel" each appear twice,
            # the latter with different values (24 and 12). Which entry takes
            # effect depends on GalaxyService; the list is kept as-is.
            smp=[
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["MuSE", 8],
                ["pindel", 8],
                ["mutect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 12],
                ["bwa_mem", 12],
                ["radia", 8],
                ['radia_filter', 8]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
Exemple #23
0
def _uploadprep_classify_vcf(name):
    """Work out which caller produced a VCF from its dataset name.

    Returns a ``(pipeline, method, variant_type, call_type)`` tuple.

    Raises ``Exception`` when the first dot-separated component of *name* is
    not a known MuSE/Broad prefix, or when a Broad name carries no
    recognisable variant type or call type.
    """
    prefix = name.split(".")[0]
    if prefix in ['MUSE_1']:
        return "muse", name.replace(".", "-"), 'somatic', 'snv_mnv'
    if prefix not in [
            'broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman',
            'broad-mutect'
    ]:
        raise Exception("Unknown pipeline %s" % (name))

    if 'somatic' in name:
        variant_type = 'somatic'
    elif 'germline' in name:
        variant_type = 'germline'
    else:
        raise Exception("Unknown variant type")

    if 'snv_mnv.vcf' in name:
        call_type = 'snv_mnv'
    elif 'sv.vcf' in name:
        call_type = 'sv'
    elif 'indel.vcf' in name:
        call_type = 'indel'
    else:
        raise Exception("Unknown call type: %s" % (name))
    return "broad", prefix, variant_type, call_type


def _uploadprep_shared_opts(args, pipeline, donor_tumor):
    """Return the uploader options that VCF and tarball submissions share."""
    return [
        " --outdir %s.%s.dir" % (pipeline, donor_tumor),
        " --key %s " % (args.keyfile),
        " --upload-url %s" % (args.upload_url),
        " --study-refname-override %s" % (args.study),
        " --workflow-url '%s'" % args.pipeline_src,
        " --workflow-src-url '%s'" % args.pipeline_src,
        " --workflow-name '%s'" % args.pipeline_name,
        " --workflow-version '%s'" % args.pipeline_version,
        " --vm-instance-type '%s'" % args.vm_instance_type,
        " --vm-instance-cores %s" % args.vm_instance_cores,
        " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb,
    ]


def run_uploadprep(args):
    """Stage finished results in ``args.workdir`` and write upload scripts.

    The function:

    1. loads the work list and the per-pipeline upload UUIDs via ``synqueue``;
    2. scans every document in the docstore at ``args.out_base``, keeping
       those in state ``ok`` that carry a ``donor:<id>`` tag for a donor on
       the work list;
    3. records job metrics per donor, copies visible VCF outputs into the
       work directory (bgzip-compressing plain ``vcf`` files) and remembers
       the location of ``broad.tar.gz``;
    4. for every donor that has results for all three pipelines
       (``broad``, ``muse``, ``broad_tar``) writes
       ``<pipeline>.<donor>.timing.json`` and ``<pipeline>.<donor>.sh``.

    Attributes read from *args*: ``workdir``, ``out_base``, ``keyfile``,
    ``upload_url``, ``study``, ``pipeline_src``, ``pipeline_name``,
    ``pipeline_version``, ``vm_instance_type``, ``vm_instance_cores``,
    ``vm_instance_mem_gb`` and ``vm_location_code``.

    Raises ``Exception`` for a visible VCF whose name cannot be classified,
    and ``subprocess.CalledProcessError`` when ``bgzip`` fails.
    """
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    file_map = {'broad': {}, 'muse': {}, 'broad_tar': {}}

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    uuid_map = {}
    for key, column in (('broad', "Broad_VCF_UUID"),
                        ('muse', "Muse_VCF_UUID"),
                        ('broad_tar', "Broad_TAR_UUID")):
        uuid_map[key] = synqueue.getValues(syn,
                                           column,
                                           orSet=lambda x: str(uuid.uuid4()),
                                           **config)

    # One date stamp for the whole run, so file names stay consistent even
    # if the scan crosses midnight.
    datestr = datetime.datetime.now().strftime("%Y%m%d")

    # scan through all of the docs
    job_map = {}
    for doc_id, entry in doc.filter():
        donor = None
        # look for docs with donor tags
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for tag in entry['tags']:
                parts = tag.split(":")
                # a bare "donor" tag without a value is ignored
                if parts[0] == 'donor' and len(parts) > 1:
                    donor = parts[1]
        if donor is None or donor not in wl_map:
            continue

        donor_jobs = job_map.setdefault(donor, {})
        # scan out the job metrics for this job
        if 'job' in entry and 'job_metrics' in entry['job']:
            job_id = entry['job']['id']
            tool_id = entry['job']['tool_id']
            metrics = {}
            for met in entry['job']['job_metrics']:
                metrics[met['name']] = met['raw_value']
            donor_jobs[job_id] = {tool_id: metrics}

        donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']
        # look for the vcf output files
        if entry.get('visible', False) and entry.get(
                'extension', None) in ["vcf", "vcf_bgzip"]:
            pipeline, method, variant_type, call_type = \
                _uploadprep_classify_vcf(entry['name'])

            name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr,
                                       variant_type, call_type)
            # Defensive: strip a literal trailing ".vcf" (dot escaped so it
            # cannot eat an arbitrary character).
            name = re.sub(r'\.vcf$', '', name)
            if entry['extension'] == 'vcf':
                file_name = name + ".vcf"
            else:
                file_name = name + ".vcf.gz"

            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(args.workdir, file_name)

                shutil.copy(src_file, dst_file)
                # if the file wasn't compressed already, go ahead and do that
                if entry['extension'] == 'vcf':
                    # argv list + explicit stdout instead of a shell string,
                    # so unusual characters in paths cannot break the command
                    with open(dst_file + ".gz", "wb") as gz_handle:
                        subprocess.check_call(["bgzip", "-c", dst_file],
                                              stdout=gz_handle)
                    dst_file = dst_file + ".gz"

                # add file to output map (scripts reference the base name)
                file_map[pipeline].setdefault(donor, []).append(
                    os.path.basename(dst_file))
        elif entry.get('name') == "broad.tar.gz":
            target = Target(uuid=entry['uuid'])
            file_map['broad_tar'][donor] = [doc.get_filename(target)]

    # regroup the metrics as donor -> tool_id -> [metrics of each job]
    timing_map = {}
    for donor, jobs in job_map.items():
        per_tool = timing_map[donor] = {}
        for job_info in jobs.values():
            for tool_id, metrics in job_info.items():
                per_tool.setdefault(tool_id, []).append(metrics)

    # number of pipelines that produced something for each donor
    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    script_template = string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted
#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""")

    required = len(file_map)
    # go through every pipeline
    for pipeline, donors in file_map.items():
        # for that pipeline go through every donor
        for donor, files in donors.items():
            # only donors on the work list with results from every pipeline
            if donor not in wl_map or result_counts[donor] != required:
                continue
            meta = wl_map[donor]

            # output the timing json
            timing_json = os.path.abspath(
                os.path.join(args.workdir,
                             "%s.%s.timing.json" % (pipeline, donor)))
            with open(timing_json, "w") as handle:
                handle.write(json.dumps(timing_map[donor]))

            urls = [
                "%scghub/metadata/analysisFull/%s" %
                (meta['Normal_WGS_alignment_GNOS_repos'],
                 meta['Normal_WGS_alignment_GNOS_analysis_ID']),
                "%scghub/metadata/analysisFull/%s" %
                (meta['Tumour_WGS_alignment_GNOS_repos'],
                 meta['Tumour_WGS_alignment_GNOS_analysis_IDs'])
            ]
            donor_tumor = meta['Tumour_WGS_alignment_GNOS_analysis_IDs']
            upload_uuid = uuid_map[pipeline][donor]
            related_uuids = [
                uuid_map[p][donor] for p in uuid_map if p != pipeline
            ]

            if pipeline in ['broad', 'muse']:
                prep_parts = []
                for vcf in files:
                    prep_parts.append("tabix -p vcf %s\n" % (vcf))
                    prep_parts.append("mv %s.tbi %s.idx\n" % (vcf, vcf))
                    prep_parts.append(
                        "md5sum %s | awk '{print$1}' > %s.md5\n" % (vcf, vcf))
                    prep_parts.append(
                        "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" %
                        (vcf, vcf))
                prep_cmd_str = "".join(prep_parts)

                file_opts = [
                    " --vcfs %s " % (",".join(files)),
                    " --vcf-md5sum-files %s " % (",".join(
                        "%s.md5" % i for i in files)),
                    " --vcf-idxs %s" % (",".join(
                        "%s.idx" % i for i in files)),
                    " --vcf-idx-md5sum-files %s" % (",".join(
                        "%s.idx.md5" % i for i in files)),
                ]
                tail_opts = [
                    " --vm-location-code %s" % args.vm_location_code,
                    " --timing-metrics-json %s" % (timing_json),
                    " --workflow-file-subset %s" % (pipeline),
                ]
            else:
                # 'broad_tar': repackage the intermediate tarball
                basename = donor_tumor + ".broad.intermediate"
                script_dir = os.path.dirname(os.path.abspath(__file__))
                prep_cmds = []
                new_files = []
                for tar in files:
                    # accumulate instead of overwriting, so no command is
                    # lost should more than one tarball ever be listed
                    prep_cmds.append(
                        "%s/remap_broad_tar.py %s %s %s --rename %s %s" %
                        (script_dir, tar, "./", basename, donor, donor_tumor))
                    new_files.append(basename + ".tar")
                prep_cmd_str = "\n".join(prep_cmds)

                file_opts = [
                    " --tarballs %s " % (",".join(new_files)),
                    " --tarball-md5sum-files %s " % (",".join(
                        "%s.md5" % i for i in new_files)),
                ]
                tail_opts = [
                    " --workflow-file-subset %s" % (pipeline),
                    " --timing-metrics-json %s" % (timing_json),
                ]

            submit_cmd_str = "".join(
                [
                    "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib",
                    " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl",
                    " --metadata-urls %s" % (",".join(urls)),
                ] + file_opts +
                _uploadprep_shared_opts(args, pipeline, donor_tumor) +
                tail_opts + [
                    " --related-file-subset-uuids %s" % (
                        ",".join(related_uuids)),
                    " --uuid %s" % (upload_uuid),
                    # " --skip-upload" can be appended here for a dry run
                ])

            # output the uploader script; written only once the whole
            # command has been built, so a failure leaves no empty script
            script_path = os.path.join(args.workdir,
                                       "%s.%s.sh" % (pipeline, donor))
            with open(script_path, "w") as handle:
                handle.write(
                    script_template.substitute(
                        PREP=prep_cmd_str,
                        SUBMIT=submit_cmd_str,
                        SUBMIT_DIR=os.path.join(
                            os.path.abspath(args.workdir), "vcf",
                            pipeline + "." + donor_tumor + ".dir",
                            upload_uuid),
                        KEY=args.keyfile))
Exemple #24
0
def run_gen(args):
    """Generate MC3 test-workflow tasks, one per donor with both BAMs.

    The function:

    1. optionally syncs the reference files from Synapse
       (``args.ref_download``) and resolves each entry of ``data_mapping``
       to a docstore uuid;
    2. syncs the ``testexomes`` directory into the docstore, restricted to
       ``args.sample`` when that is not ``None``;
    3. builds a ``GalaxyWorkflowTask`` for every participant that has both a
       ``tumour`` and a ``normal`` document, and writes each task as JSON
       into ``<args.out_base>.tasks/``;
    4. when ``args.create_service`` is set, writes the Galaxy service
       configuration to ``<args.out_base>.service``.

    Raises ``Exception`` when a reference file is missing from the docstore
    and ``KeyError`` when a participant has no ``fake_metadata`` entry.
    """
    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "db_snp": "dbsnp_132_b37.leftAligned.vcf",
        "centromere": "centromere_hg19.bed",
        "reference_genome": "Homo_sapiens_assembly19.fasta",
        "cosmic": "b37_cosmic_v54_120711.vcf"
    }

    if args.ref_download:
        syn_sync(syn, REFDATA_PROJECT, docstore, data_mapping.values())

    # map each workflow input name to the docstore uuid of its file
    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    testexomes = os.path.join(os.path.dirname(__file__), "..", "testexomes")
    if args.sample is not None:
        sync_doc_dir(testexomes,
                     docstore,
                     filter=lambda x: x['donorId'] in args.sample)
    else:
        sync_doc_dir(testexomes, docstore)

    tumor_uuids = {}
    normal_uuids = {}

    for doc_id, ent in docstore.filter(sampleType="tumour"):
        tumor_uuids[ent['participant_id']] = doc_id

    for doc_id, ent in docstore.filter(sampleType="normal"):
        normal_uuids[ent['participant_id']] = doc_id

    mc3_workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-MC3_Pipeline_Test.ga")

    # participant_id -> fake_metadata key, built once instead of rescanning
    # fake_metadata for every donor (a later duplicate still wins)
    donor_names = {}
    for k, v in fake_metadata.items():
        donor_names[v['participant_id']] = k

    tasks = TaskGroup()
    for donor in tumor_uuids:
        if donor not in normal_uuids:
            continue
        # single-argument print() emits the same text under Python 2
        print("participant %s" % (donor,))

        donor_name = donor_names.get(donor)
        if donor_name is None:
            # previously surfaced as an uninformative "KeyError: None"
            raise KeyError("no fake_metadata entry for participant %s" %
                           (donor,))
        meta = fake_metadata[donor_name]

        workflow_dm = dict(dm)
        workflow_dm['tumor_bam'] = {"uuid": tumor_uuids[donor]}
        workflow_dm['normal_bam'] = {"uuid": normal_uuids[donor]}

        task = GalaxyWorkflowTask(
            "workflow_%s" % (donor),
            mc3_workflow,
            inputs=workflow_dm,
            parameters={
                "reheader_config": {
                    "platform": "Illumina",
                    "center": "OHSU",
                    "reference_genome": "Homo_sapiens_assembly19.fasta",
                    "participant_uuid": meta['participant_id'],
                    "disease_code": meta['disease'],
                    "filedate": datetime.datetime.now().strftime("%Y%m%d"),
                    "normal_analysis_uuid": meta['normal']['uuid'],
                    "normal_bam_name": meta['normal']['file_name'],
                    "normal_aliquot_uuid": meta['normal']['aliquot_id'],
                    "normal_aliquot_barcode": meta['normal']['barcode'],
                    "tumor_analysis_uuid": meta['tumour']['uuid'],
                    "tumor_bam_name": meta['tumour']['file_name'],
                    "tumor_aliquot_uuid": meta['tumour']['aliquot_id'],
                    "tumor_aliquot_barcode": meta['tumour']['barcode'],
                }
            },
            tags=["donor:%s" % (donor)],
        )
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)
    for data in tasks:
        with open("%s/%s" % (task_dir, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:
        # NOTE(review): gatk_bqsr and gatk_indel are each listed twice
        # (gatk_indel with 24 and with 12); which entry GalaxyService
        # honours is not visible here -- confirm before pruning.
        service = GalaxyService(docstore=docstore,
                                galaxy=args.galaxy,
                                sudo=args.sudo,
                                tool_data=args.tool_data,
                                tool_dir=args.tool_dir,
                                work_dir=args.work_dir,
                                smp=[["gatk_bqsr", 12], ["gatk_indel", 24],
                                     ["MuSE", 8], ["pindel", 8], ["mutect", 8],
                                     ["delly", 4], ["gatk_bqsr", 12],
                                     ["gatk_indel", 12], ["bwa_mem", 12],
                                     ["radia", 8], ['radia_filter', 8]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)
Exemple #25
0
def run_gen(args):
    """Generate PCAWG CGHub workflow tasks for unassigned work-list entries.

    The function:

    1. optionally downloads the reference files from Synapse
       (``args.ref_download``) into the docstore at ``args.out_base``,
       gunzipping those annotated with ``dataPrep == 'gunzip'``;
    2. resolves each entry of ``data_mapping`` to a docstore uuid;
    3. creates one ``GalaxyWorkflowTask`` per assignment whose ``state`` is
       NaN, writes it as JSON into ``<args.out_base>.tasks/`` and removes any
       ``.state`` file left next to it;
    4. when ``args.create_service`` is set, writes the Galaxy service
       configuration to ``<args.out_base>.service``.

    Raises ``Exception`` when a reference file is missing from the docstore
    and ``subprocess.CalledProcessError`` when ``gunzip`` fails.
    """
    # NOTE(review): the args passed in is discarded and the command line is
    # parsed again; kept as-is because callers may rely on it -- confirm.
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome": "genome.fa",
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere": "centromere_hg19.bed"
    }

    if args.ref_download:
        # download reference files from Synapse and populate the document
        # store; the wanted names are collected once, not per query row
        wanted = set(data_mapping.values())
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' %
                                  (REFDATA_PROJECT)):
            entity_name = a['entity.name']
            # single-argument print() emits the same text under Python 2
            print("found %s" % (entity_name,))
            if not (entity_name in wanted
                    or entity_name.replace(".gz", "") in wanted):
                continue
            print("loading")
            ent = syn.get(a['entity.id'])
            ref_uuid = ent.annotations['uuid'][0]
            t = Target(uuid=ref_uuid)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # argv list + explicit stdout instead of a shell string,
                    # so unusual characters in paths cannot break the command
                    with open(path, "wb") as out_handle:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=out_handle)
                    name = name.replace(".gz", "")
                else:
                    # NOTE(review): nothing is written to `path` in this
                    # case, yet the document is still registered below.
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            docstore.put(ref_uuid, {'name': name, 'uuid': ref_uuid})

    # map each workflow input name to the docstore uuid of its file
    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")
    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        state = ent['state']
        # only entries whose state is NaN (not a string) get a task
        if isinstance(state, basestring) or not isnan(state):
            continue
        meta = ent['meta']
        gnos_endpoint = urlparse(
            meta['Normal_WGS_alignment_GNOS_repos']).netloc
        task = GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=dm,
            parameters={
                'normal_bam_download': {
                    "uuid": meta['Normal_WGS_alignment_GNOS_analysis_ID'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": key_map[gnos_endpoint]
                },
                'tumor_bam_download': {
                    "uuid": meta['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": key_map[gnos_endpoint]
                },
                'broad_variant_pipeline': {
                    "broad_ref_dir": "/tool_data/files/refdata",
                    "sample_id": meta['Submitter_donor_ID']
                }
            },
            tags=["donor:%s" % (meta['Submitter_donor_ID'])])
        tasks.append(task)

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for data in tasks:
        with open("%s/%s" % (task_dir, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))
        # drop any state file left next to a task of the same id
        state_file = "%s/%s.state" % (task_dir, data.task_id)
        if os.path.exists(state_file):
            os.unlink(state_file)

    print("Tasks Created: %s" % (len(tasks)))

    if args.create_service:
        # NOTE(review): "broad_variant_pipline" and "muTect" look like
        # possible misspellings of the tool ids -- verify against the tool
        # definitions before changing them.
        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=args.sudo,
                                tool_data=os.path.abspath("tool_data"),
                                tool_dir=os.path.abspath("tools"),
                                work_dir=args.work_dir,
                                smp=[["MuSE", 8], ["pindel", 8], ["muTect", 8],
                                     ["delly", 4], ["gatk_bqsr", 12],
                                     ["gatk_indel", 24], ["bwa_mem", 12],
                                     ["broad_variant_pipline", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)