Python listAssignments Examples

Programming Language: Python

Namespace/Package Name: synqueue

Method/Function: listAssignments

Examples at hotexamples.com: 14

Python listAssignments - 14 examples found. These are the top-rated real-world Python examples of synqueue.listAssignments, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def run_audit(args):
    """Report how many named outputs exist for each assigned donor.

    Builds a map ``donor -> {document name -> document id}`` from the
    visible, non-empty documents in state "ok", then prints one line
    (``<donor id> <number of documents>``) for every assignment whose
    ``Submitter_donor_ID`` appears in that map.

    Note: the ``args`` parameter is immediately replaced by the result of
    ``parser.parse_args()``.
    """
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    donor_map = {}
    for doc_id, doc in docstore.filter(state="ok"):
        # skip hidden documents and documents without any stored data
        if not doc['visible']:
            continue
        if not (docstore.size(Target(doc_id)) > 0):
            continue

        # the last "donor:<value>" tag wins; stays None when there is none
        donor = None
        for tag in doc['tags']:
            fields = tag.split(":")
            if fields[0] == "donor":
                donor = fields[1]

        donor_map.setdefault(donor, {})[doc['name']] = doc_id

    for assignment in synqueue.listAssignments(syn, list_all=True, **config):
        donor_id = assignment['meta']['Submitter_donor_ID']
        if donor_id in donor_map:
            print("%s %s" % (donor_id, len(donor_map[donor_id])))

Example #2

Show file

File: pcawg_wf_gen.py Project: ICGC-TCGA-PanCancer/pcawg_tools

def run_audit(args):
    """Audit the document store against the Synapse assignment list.

    For every assignment whose ``Submitter_donor_ID`` has at least one
    visible, non-empty document in state "ok", print the donor id followed
    by the number of distinct document names recorded for that donor.

    Note: ``args`` is overwritten by ``parser.parse_args()`` on entry.
    """
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    # donor -> {document name -> document id}
    donor_map = {}
    for entry_id, entry in docstore.filter(state="ok"):
        if entry['visible'] and docstore.size(Target(entry_id)) > 0:
            donor = None
            for raw_tag in entry['tags']:
                pieces = raw_tag.split(":")
                if pieces[0] == "donor":
                    # later donor tags override earlier ones
                    donor = pieces[1]
            outputs = donor_map.setdefault(donor, {})
            outputs[entry['name']] = entry_id

    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        submitter = ent['meta']['Submitter_donor_ID']
        if submitter not in donor_map:
            continue
        print("%s %s" % (submitter, len(donor_map[submitter])))

Example #3

Show file

File: mc3_gatk_wf_gen.py Project: gaurav-kaushik/mc3

def run_gen(args):
    """Generate GATK workflow task files and, optionally, a service config.

    Steps, in order:
      1. Log in to Synapse; if ``args.alt_table`` is set it replaces
         ``config['table_id']``.
      2. If ``args.ref_download`` is set, copy every entity found under
         ``REFDATA_PROJECT`` into the document store, gunzipping entities
         annotated with ``dataPrep == 'gunzip'``.
      3. Look up the fixed reference inputs (dbsnp, cosmic, indel sets) in
         the document store by file name.
      4. Build one ``GalaxyWorkflowTask`` per assignment that has exactly
         2 or 3 BAM ids; other assignments are skipped silently.
      5. Write every task as JSON to ``<out_base>.tasks/<task_id>``.
      6. If ``args.create_service`` is set, write ``<out_base>.service``.

    Raises:
        Exception: a required reference file is missing from the store.
        AssertionError: an assignment does not name exactly one reference
            assembly.
    """
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        # download reference files from Synapse and populate the document store
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for row in syn.chunkedQuery(query):
            ent = syn.get(row['entity.id'])

            ref_uuid = ent.annotations['uuid'][0]
            target = Target(uuid=ref_uuid)
            docstore.create(target)
            path = docstore.get_filename(target)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Argument list plus explicit stdout instead of a shell
                    # string, so paths containing spaces or shell
                    # metacharacters are handled correctly.
                    with open(path, "wb") as unzipped:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=unzipped)
                    name = name.replace(".gz", "")
                else:
                    # nothing is copied for an unrecognised dataPrep value
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(target)
            docstore.put(ref_uuid, {'name': name, 'uuid': ref_uuid})

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels": "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    # workflow input name -> {"uuid": <document id>}
    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = None
        # when several documents share the name, the last one returned wins
        for found in docstore.filter(name=file_name):
            hit = found[0]
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    # one workflow definition per supported number of input BAMs
    workflows = {
        2: GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga"),
        3: GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga")
    }

    ref_rename = {
        "HG19_Broad_variant": "Homo_sapiens_assembly19"
    }

    tasks = TaskGroup()

    for ent in synqueue.listAssignments(syn, **config):
        meta = ent['meta']
        # NOTE(review): BAM order follows dict iteration order of the
        # metadata, exactly as before -- confirm whether the order matters.
        bam_set = [
            value for field, value in meta.items()
            if field.startswith("id_") and isinstance(value, basestring)
        ]

        ref_set = set(
            value for field, value in meta.items()
            if field.startswith("ref_assembly_") and isinstance(value, basestring)
        )
        # explicit raise (same exception type as the former assert) so the
        # check also runs under "python -O"
        if len(ref_set) != 1:
            raise AssertionError(
                "expected exactly one reference assembly, found %d" % len(ref_set))
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]

        hit = None
        # ".fa" is searched after ".fasta", so a ".fa" match takes precedence
        for suffix in (".fasta", ".fa"):
            for found in docstore.filter(name=ref_name + suffix):
                hit = found[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}

        workflow = workflows.get(len(bam_set))
        if workflow is None:
            # only assignments with 2 or 3 BAMs get a task
            continue

        parameters = {}
        tool_tags = {}
        for index, bam_uuid in enumerate(bam_set, 1):
            parameters['INPUT_BAM_%d' % index] = {
                "uuid": bam_uuid,
                "gnos_endpoint": "cghub.ucsc.edu",
                "cred_file": "/tool_data/files/cghub.key"
            }
            tool_tags["BQSR_%d" % index] = {
                "output_bam": ["original_bam:%s" % (bam_uuid)]
            }

        tasks.append(GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=workflow_dm,
            parameters=parameters,
            tags=["donor:%s" % (meta['participant_id'])],
            tool_tags=tool_tags
        ))

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)
    for task in tasks:
        with open("%s/%s" % (task_dir, task.task_id), "w") as handle:
            handle.write(json.dumps(task.to_dict()))

    if args.create_service:

        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=True,
            tool_data=args.tool_data,
            tool_dir=args.tool_dir,
            work_dir=args.work_dir,
            smp=[
                ["gatk_bqsr", 12],
                ["gatk_indel", 24]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)

Example #4

Show file

File: mc3_gatk_wf_gen.py Project: gaurav-kaushik/mc3

def run_list(args):
    """Display the assignment table, optionally from an alternate table.

    Logs in to Synapse, replaces ``config['table_id']`` with
    ``args.alt_table`` when one is given, and calls
    ``synqueue.listAssignments`` with ``display=True``.
    """
    client = synapseclient.Synapse()
    client.login()

    alt_table = args.alt_table
    if alt_table is not None:
        config['table_id'] = alt_table

    synqueue.listAssignments(client, display=True, **config)

Example #5

Show file

def run_list(args):
    """Log in to Synapse and display the assignments for ``config``.

    ``args`` is accepted for the command-handler signature but not read.
    """
    session = synapseclient.Synapse()
    session.login()
    synqueue.listAssignments(session, display=True, **config)

Example #6

Show file

def run_gen(args):
    """Generate PCAWG workflow task files and, optionally, a service config.

    Steps, in order:
      1. Re-parse the command line (``args`` is replaced by
         ``parser.parse_args()``) and log in to Synapse.
      2. If ``args.ref_download`` is set, copy the reference entities named
         in ``data_mapping`` (with or without a ``.gz`` suffix) from
         ``REFDATA_PROJECT`` into the document store.
      3. Resolve every ``data_mapping`` entry to a document uuid.
      4. Create one ``GalaxyWorkflowTask`` for every assignment whose
         ``state`` is a non-string NaN.
      5. Write each task to ``<out_base>.tasks/<task_id>`` and remove a
         matching ``.state`` file if one exists.
      6. If ``args.create_service`` is set, write ``<out_base>.service``.

    Raises:
        Exception: a required reference file is missing from the store.
    """
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome": "genome.fa",
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere": "centromere_hg19.bed"
    }

    if args.ref_download:
        # download reference files from Synapse and populate the document store
        wanted_names = set(data_mapping.values())  # built once, not per row
        query = 'select * from entity where parentId=="%s"' % (REFDATA_PROJECT)
        for row in syn.chunkedQuery(query):
            entity_name = row['entity.name']
            print("found %s" % (entity_name,))
            if not (entity_name in wanted_names
                    or entity_name.replace(".gz", "") in wanted_names):
                continue
            print("loading")
            ent = syn.get(row['entity.id'])
            ref_uuid = ent.annotations['uuid'][0]
            target = Target(uuid=ref_uuid)
            docstore.create(target)
            path = docstore.get_filename(target)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    # Argument list plus explicit stdout instead of a shell
                    # string, so paths containing spaces or shell
                    # metacharacters are handled correctly.
                    with open(path, "wb") as unzipped:
                        subprocess.check_call(["gunzip", "-c", ent.path],
                                              stdout=unzipped)
                    name = name.replace(".gz", "")
                else:
                    # nothing is copied for an unrecognised dataPrep value
                    print("Unknown DataPrep")
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(target)
            docstore.put(ref_uuid, {'name': name, 'uuid': ref_uuid})

    # workflow input name -> {"uuid": <document id>}
    dm = {}
    for input_name, file_name in data_mapping.items():
        hit = None
        # when several documents share the name, the last one returned wins
        for found in docstore.filter(name=file_name):
            hit = found[0]
        if hit is None:
            raise Exception("%s not found" % (file_name))
        dm[input_name] = {"uuid": hit}

    workflow = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")
    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        state = ent['state']
        # only assignments whose state is a (non-string) NaN get a task
        if isinstance(state, basestring) or not isnan(state):
            continue

        meta = ent['meta']
        gnos_endpoint = urlparse(
            meta['Normal_WGS_alignment_GNOS_repos']).netloc
        # both downloads use the endpoint and key of the normal sample
        cred_file = key_map[gnos_endpoint]
        tasks.append(GalaxyWorkflowTask(
            "workflow_%s" % (ent['id']),
            workflow,
            inputs=dm,
            parameters={
                'normal_bam_download': {
                    "uuid": meta['Normal_WGS_alignment_GNOS_analysis_ID'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": cred_file
                },
                'tumor_bam_download': {
                    "uuid": meta['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                    "gnos_endpoint": gnos_endpoint,
                    "cred_file": cred_file
                },
                'broad_variant_pipeline': {
                    "broad_ref_dir": "/tool_data/files/refdata",
                    "sample_id": meta['Submitter_donor_ID']
                }
            },
            tags=["donor:%s" % (meta['Submitter_donor_ID'])]))

    task_dir = "%s.tasks" % (args.out_base)
    if not os.path.exists(task_dir):
        os.mkdir(task_dir)

    for task in tasks:
        task_path = "%s/%s" % (task_dir, task.task_id)
        with open(task_path, "w") as handle:
            handle.write(json.dumps(task.to_dict()))
        # drop any state file left over for this task id
        state_file = task_path + ".state"
        if os.path.exists(state_file):
            os.unlink(state_file)

    print("Tasks Created: %s" % (len(tasks)))

    if args.create_service:
        # NOTE(review): "broad_variant_pipline" is kept verbatim; confirm
        # whether the tool id is really spelled this way.
        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=args.sudo,
                                tool_data=os.path.abspath("tool_data"),
                                tool_dir=os.path.abspath("tools"),
                                work_dir=args.work_dir,
                                smp=[["MuSE", 8], ["pindel", 8], ["muTect", 8],
                                     ["delly", 4], ["gatk_bqsr", 12],
                                     ["gatk_indel", 24], ["bwa_mem", 12],
                                     ["broad_variant_pipline", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print("Using scratch %s" % (args.scratch,))
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)

Example #7

Show file

def _classify_vcf_output(name):
    """Return ``(pipeline, method, variant_type, call_type)`` for a VCF name.

    The part of ``name`` before the first "." selects the caller family:
    ``MUSE_1`` maps to the "muse" pipeline, the ``broad-*`` callers map to
    "broad". For broad files the variant and call types are read from
    substrings of the name.

    Raises:
        Exception: the caller prefix, variant type or call type is unknown.
    """
    prefix = name.split('.')[0]
    if prefix in ['MUSE_1']:
        return "muse", name.replace(".", "-"), 'somatic', 'snv_mnv'

    if prefix not in ['broad-dRanger', 'broad-dRanger_snowman',
                      'broad-snowman', 'broad-mutect']:
        raise Exception("Unknown pipeline %s" % (name))

    if 'somatic' in name:
        variant_type = 'somatic'
    elif 'germline' in name:
        variant_type = 'germline'
    else:
        raise Exception("Unknown variant type")

    if 'snv_mnv.vcf' in name:
        call_type = 'snv_mnv'
    elif 'sv.vcf' in name:
        call_type = 'sv'
    elif 'indel.vcf' in name:
        call_type = 'indel'
    else:
        raise Exception("Unknown call type: %s" % (name))

    return "broad", prefix, variant_type, call_type


def run_uploadprep(args):
    """Stage pipeline outputs in ``args.workdir`` and write upload scripts.

    Steps, in order:
      1. Create ``args.workdir`` if needed and load the work list
         (assignment id -> metadata) from Synapse.
      2. Fetch (or create via ``orSet``) the upload uuids for the
         ``broad``, ``muse`` and ``broad_tar`` result sets.
      3. Scan the document store: collect job metrics per donor, copy
         visible VCF outputs into the work directory under their upload
         names (bgzip-compressing plain VCFs), and record the location of
         each donor's ``broad.tar.gz``.
      4. For every donor that has results in all three result sets, write
         ``<pipeline>.<donor>.timing.json`` and ``<pipeline>.<donor>.sh``.

    Raises:
        Exception: a VCF document name cannot be classified.
    """
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    file_map = {'broad': {}, 'muse': {}, 'broad_tar': {}}

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    uuid_map = {}
    uuid_map['broad'] = synqueue.getValues(syn,
                                           "Broad_VCF_UUID",
                                           orSet=lambda x: str(uuid.uuid4()),
                                           **config)
    uuid_map['muse'] = synqueue.getValues(syn,
                                          "Muse_VCF_UUID",
                                          orSet=lambda x: str(uuid.uuid4()),
                                          **config)
    uuid_map['broad_tar'] = synqueue.getValues(
        syn, "Broad_TAR_UUID", orSet=lambda x: str(uuid.uuid4()), **config)

    # scan through all of the docs
    for _doc_id, entry in doc.filter():
        donor = None
        # look for docs with donor tags
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for tag in entry['tags']:
                tmp = tag.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is None or donor not in wl_map:
            continue

        if donor not in job_map:
            job_map[donor] = {}
        # scan out the job metrics for this job
        if 'job' in entry and 'job_metrics' in entry['job']:
            job_id = entry['job']['id']
            tool_id = entry['job']['tool_id']
            job_info = {tool_id: {}}
            for met in entry['job']['job_metrics']:
                job_info[tool_id][met['name']] = met['raw_value']
            job_map[donor][job_id] = job_info
        donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']

        # look for the vcf output files
        if entry.get('visible', False) and entry.get(
                'extension', None) in ["vcf", "vcf_bgzip"]:
            # fill out the info depending on which caller created the file
            pipeline, method, variant_type, call_type = _classify_vcf_output(
                entry['name'])

            datestr = datetime.datetime.now().strftime("%Y%m%d")
            name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr,
                                       variant_type, call_type)

            # escaped dot: strip a literal ".vcf" suffix only
            name = re.sub(r'\.vcf$', '', name)
            if entry['extension'] == 'vcf':
                file_name = name + ".vcf"
            elif entry['extension'] == 'vcf_bgzip':
                file_name = name + ".vcf.gz"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(args.workdir, file_name)

                shutil.copy(src_file, dst_file)
                # if the file wasn't compressed already, go ahead and do that
                if entry['extension'] == 'vcf':
                    # Argument list plus explicit stdout instead of a shell
                    # string, so paths containing spaces or shell
                    # metacharacters are handled correctly.
                    with open(dst_file + ".gz", "wb") as gz_handle:
                        subprocess.check_call(["bgzip", "-c", dst_file],
                                              stdout=gz_handle)
                    dst_file = dst_file + ".gz"

                # add file to output map
                if donor not in file_map[pipeline]:
                    file_map[pipeline][donor] = []
                file_map[pipeline][donor].append(os.path.basename(dst_file))
        else:
            if entry['name'] == "broad.tar.gz":
                target = Target(uuid=entry['uuid'])
                src_file = doc.get_filename(target)
                file_map['broad_tar'][donor] = [src_file]

    # donor -> tool id -> list of metric dicts (one per job)
    timing_map = {}
    for donor in job_map:
        timing_map[donor] = {}
        for job_id in job_map[donor]:
            for tool_id in job_map[donor][job_id]:
                timing_map[donor].setdefault(tool_id, []).append(
                    job_map[donor][job_id][tool_id])

    # donor -> number of result sets in file_map that contain the donor
    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    # go through every pipeline
    for pipeline, donors in file_map.items():
        # for that pipeline go through every donor
        for donor, files in donors.items():
            # we're only outputting data for donors on the work list that
            # have results in all three result sets
            if not (donor in wl_map and result_counts[donor] == 3):
                continue

            # output the timing json
            timing_json = os.path.abspath(
                os.path.join(args.workdir,
                             "%s.%s.timing.json" % (pipeline, donor)))
            with open(timing_json, "w") as handle:
                handle.write(json.dumps(timing_map[donor]))

            # output the uploader script
            script_path = os.path.join(args.workdir,
                                       "%s.%s.sh" % (pipeline, donor))
            with open(script_path, "w") as handle:
                # (the original read the scan loop's stale ``dst_file`` into
                # an unused local here; that statement has been removed)
                urls = [
                    "%scghub/metadata/analysisFull/%s" %
                    (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'],
                     wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" %
                    (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'],
                     wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                ]
                donor_tumor = wl_map[donor][
                    'Tumour_WGS_alignment_GNOS_analysis_IDs']

                # uuids of this donor's uploads from the other result sets
                related_uuids = [
                    uuid_map[other][donor]
                    for other in uuid_map if other != pipeline
                ]

                if pipeline in ['broad', 'muse']:
                    prep_parts = []
                    for vcf in files:
                        prep_parts.append("tabix -p vcf %s\n" % (vcf))
                        prep_parts.append("mv %s.tbi %s.idx\n" % (vcf, vcf))
                        prep_parts.append(
                            "md5sum %s | awk '{print$1}' > %s.md5\n" % (
                                vcf, vcf))
                        prep_parts.append(
                            "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" % (
                                vcf, vcf))
                    prep_cmd_str = "".join(prep_parts)

                    submit_cmd_str = "".join([
                        "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib",
                        " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl",
                        " --metadata-urls %s" % (",".join(urls)),
                        " --vcfs %s " % (",".join(files)),
                        " --vcf-md5sum-files %s " % (
                            ",".join("%s.md5" % i for i in files)),
                        " --vcf-idxs %s" % (
                            ",".join("%s.idx" % i for i in files)),
                        " --vcf-idx-md5sum-files %s" % (
                            ",".join("%s.idx.md5" % i for i in files)),
                        " --outdir %s.%s.dir" % (pipeline, donor_tumor),
                        " --key %s " % (args.keyfile),
                        " --upload-url %s" % (args.upload_url),
                        " --study-refname-override %s" % (args.study),
                        " --workflow-url '%s'" % args.pipeline_src,
                        " --workflow-src-url '%s'" % args.pipeline_src,
                        " --workflow-name '%s'" % args.pipeline_name,
                        " --workflow-version '%s'" % args.pipeline_version,
                        " --vm-instance-type '%s'" % args.vm_instance_type,
                        " --vm-instance-cores %s" % args.vm_instance_cores,
                        " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb,
                        " --vm-location-code %s" % args.vm_location_code,
                        " --timing-metrics-json %s" % (timing_json),
                        " --workflow-file-subset %s" % (pipeline),
                        " --related-file-subset-uuids %s" % (
                            ",".join(related_uuids)),
                        " --uuid %s" % (uuid_map[pipeline][donor]),
                    ])

                elif pipeline in ['broad_tar']:
                    prep_cmd_str = ""
                    new_files = []
                    for tar in files:
                        basename = donor_tumor + ".broad.intermediate"
                        # assignment (not append) is kept from the original;
                        # the scan above stores exactly one tarball per donor
                        prep_cmd_str = "%s/remap_broad_tar.py %s %s %s --rename %s %s" % (
                            os.path.dirname(os.path.abspath(__file__)),
                            tar, "./", basename, donor, donor_tumor)
                        new_files.append(basename + ".tar")

                    # NOTE(review): unlike the VCF command, this one passes
                    # no --vm-location-code -- confirm that is intended.
                    submit_cmd_str = "".join([
                        "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib",
                        " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl",
                        " --metadata-urls %s" % (",".join(urls)),
                        " --tarballs %s " % (",".join(new_files)),
                        " --tarball-md5sum-files %s " % (
                            ",".join("%s.md5" % i for i in new_files)),
                        " --outdir %s.%s.dir" % (pipeline, donor_tumor),
                        " --key %s " % (args.keyfile),
                        " --upload-url %s" % (args.upload_url),
                        " --study-refname-override %s" % (args.study),
                        " --workflow-url '%s'" % args.pipeline_src,
                        " --workflow-src-url '%s'" % args.pipeline_src,
                        " --workflow-name '%s'" % args.pipeline_name,
                        " --workflow-version '%s'" % args.pipeline_version,
                        " --vm-instance-type '%s'" % args.vm_instance_type,
                        " --vm-instance-cores %s" % args.vm_instance_cores,
                        " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb,
                        " --workflow-file-subset %s" % (pipeline),
                        " --timing-metrics-json %s" % (timing_json),
                        " --related-file-subset-uuids %s" % (
                            ",".join(related_uuids)),
                        " --uuid %s" % (uuid_map[pipeline][donor]),
                    ])

                # "$$" in the template is an escaped "$" in the output script
                handle.write(
                    string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted
#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""").substitute(PREP=prep_cmd_str,
                SUBMIT=submit_cmd_str,
                SUBMIT_DIR=os.path.join(os.path.abspath(args.workdir), "vcf",
                                        pipeline + "." + donor_tumor + ".dir",
                                        uuid_map[pipeline][donor]),
                KEY=args.keyfile))

Example #8

Show file

File: pcawg_wf_gen.py Project: jhl667/pcawg_tools

    # Resolve every workflow input name to the uuid of the matching document.
    dm = {}
    for k,v in data_mapping.items():
        hit = None
        # when several documents share the name, the last one returned wins
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            # was `"%s not found" %s (v)`, which parses as `% s(v)` and
            # raised NameError on `s` instead of reporting the missing file
            raise Exception("%s not found" % (v))
        dm[k] = { "uuid" : hit }

    workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-PCAWG_BROAD_MUSE.ga")

    config = synqueue.find_config()

    tasks = TaskGroup()

    for ent in synqueue.listAssignments(syn, **config):
        task = GalaxyWorkflowTask("workflow_%s" % (ent['id']),
            workflow,
            inputs=dm,
            parameters={
                'normal_bam_download' : {
                    "uuid" : ent['meta']['Normal_Analysis_ID'],
                    "gnos_endpoint" : "https://cghub.ucsc.edu",
                    "cred_file" : "/tool_data/files/cghub.key"
                },
                'tumor_bam_download' : {
                    "uuid" : ent['meta']['Tumour_Analysis_ID'],
                    "gnos_endpoint" : "https://cghub.ucsc.edu",
                    "cred_file" : "/tool_data/files/cghub.key"
                },
                'broad_variant_pipeline' : {

Example #9

Show file

def run_gen(args):
    syn = synapseclient.Synapse()
    syn.login()

    if args.alt_table is not None:
        config['table_id'] = args.alt_table

    docstore = from_url(args.out_base)

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' %
                                  (REFDATA_PROJECT)):
            ent = syn.get(a['entity.id'])

            id = ent.annotations['uuid'][0]
            t = Target(uuid=id)
            docstore.create(t)
            path = docstore.get_filename(t)
            name = ent.name
            if 'dataPrep' in ent.annotations:
                if ent.annotations['dataPrep'][0] == 'gunzip':
                    subprocess.check_call("gunzip -c %s > %s" %
                                          (ent.path, path),
                                          shell=True)
                    name = name.replace(".gz", "")
                else:
                    print "Unknown DataPrep"
            else:
                shutil.copy(ent.path, path)
            docstore.update_from_file(t)
            meta = {}
            meta['name'] = name
            meta['uuid'] = id
            if 'dataPrep' in meta:
                del meta['dataPrep']
            docstore.put(id, meta)

    data_mapping = {
        "dbsnp": "dbsnp_132_b37.leftAligned.vcf",
        "cosmic": "b37_cosmic_v54_120711.vcf",
        "gold_indels":
        "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels": "1000G_phase1.indels.hg19.sites.fixed.vcf"
    }

    dm = {}
    for k, v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = {"uuid": hit}

    workflow_2 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_2.ga")
    workflow_3 = GalaxyWorkflow(
        ga_file="workflows/Galaxy-Workflow-GATK_CGHub_3.ga")

    ref_rename = {"HG19_Broad_variant": "Homo_sapiens_assembly19"}

    tasks = TaskGroup()

    for ent in synqueue.listAssignments(syn, **config):
        bam_set = list(
            a[1] for a in ent['meta'].items()
            if a[0].startswith("id_") and isinstance(a[1], basestring))

        ref_set = set(a[1] for a in ent['meta'].items()
                      if a[0].startswith("ref_assembly_")
                      and isinstance(a[1], basestring))
        assert (len(ref_set) == 1)
        ref_name = ref_set.pop()
        if ref_name in ref_rename:
            ref_name = ref_rename[ref_name]

        hit = None
        for a in docstore.filter(name=ref_name + ".fasta"):
            hit = a[0]
        for a in docstore.filter(name=ref_name + ".fa"):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (ref_name))
        workflow_dm = dict(dm)
        workflow_dm['reference_genome'] = {"uuid": hit}
        if len(bam_set) == 2:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_2,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {
                        "output_bam": ["original_bam:%s" % (bam_set[0])]
                    },
                    "BQSR_2": {
                        "output_bam": ["original_bam:%s" % (bam_set[1])]
                    }
                })
            tasks.append(task)
        elif len(bam_set) == 3:
            task = GalaxyWorkflowTask(
                "workflow_%s" % (ent['id']),
                workflow_3,
                inputs=workflow_dm,
                parameters={
                    'INPUT_BAM_1': {
                        "uuid": bam_set[0],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_2': {
                        "uuid": bam_set[1],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    },
                    'INPUT_BAM_3': {
                        "uuid": bam_set[2],
                        "gnos_endpoint": "cghub.ucsc.edu",
                        "cred_file": "/tool_data/files/cghub.key"
                    }
                },
                tags=["donor:%s" % (ent['meta']['participant_id'])],
                tool_tags={
                    "BQSR_1": {
                        "output_bam": ["original_bam:%s" % (bam_set[0])]
                    },
                    "BQSR_2": {
                        "output_bam": ["original_bam:%s" % (bam_set[1])]
                    },
                    "BQSR_3": {
                        "output_bam": ["original_bam:%s" % (bam_set[2])]
                    }
                })
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))
    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id),
                  "w") as handle:
            handle.write(json.dumps(data.to_dict()))

    if args.create_service:

        service = GalaxyService(docstore=docstore,
                                galaxy="bgruening/galaxy-stable",
                                sudo=True,
                                tool_data=args.tool_data,
                                tool_dir=args.tool_dir,
                                work_dir=args.work_dir,
                                smp=[["gatk_bqsr", 12], ["gatk_indel", 24]])
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)

Example #10

Show file

def run_list(args):
    """Log in to Synapse and display the assignment listing.

    When ``args.alt_table`` is given it replaces ``table_id`` in the shared
    ``config`` before the listing is requested.
    """
    client = synapseclient.Synapse()
    client.login()
    table_override = args.alt_table
    if table_override is not None:
        config['table_id'] = table_override
    synqueue.listAssignments(client, display=True, **config)

Example #11

Show file

File: pcawg_wf_gen.py Project: ICGC-TCGA-PanCancer/pcawg_tools

def run_list(args):
    """Log in to Synapse and display the assignment listing.

    ``args`` is accepted but not read by this function; the listing is
    driven entirely by the shared ``config``.
    """
    client = synapseclient.Synapse()
    client.login()
    synqueue.listAssignments(client, display=True, **config)

Example #12

Show file

File: pcawg_wf_gen.py Project: ICGC-TCGA-PanCancer/pcawg_tools

def run_gen(args):
    args = parser.parse_args()

    syn = synapseclient.Synapse()
    syn.login()

    docstore = from_url(args.out_base)

    data_mapping = {
        "reference_genome" : "genome.fa",
        "dbsnp" : "dbsnp_132_b37.leftAligned.vcf",
        "cosmic" : "b37_cosmic_v54_120711.vcf",
        "gold_indels" : "Mills_and_1000G_gold_standard.indels.hg19.sites.fixed.vcf",
        "phase_one_indels" : "1000G_phase1.indels.hg19.sites.fixed.vcf",
        "centromere" : "centromere_hg19.bed"
    }

    if args.ref_download:
        #download reference files from Synapse and populate the document store
        for a in syn.chunkedQuery('select * from entity where parentId=="%s"' % (REFDATA_PROJECT)):
            print "found",  a['entity.name']
            if a['entity.name'] in data_mapping.values() or a['entity.name'].replace(".gz", "") in data_mapping.values():
                print "loading"
                ent = syn.get(a['entity.id'])
                id = ent.annotations['uuid'][0]
                t = Target(uuid=id)
                docstore.create(t)
                path = docstore.get_filename(t)
                name = ent.name
                if 'dataPrep' in ent.annotations:
                    if ent.annotations['dataPrep'][0] == 'gunzip':
                        subprocess.check_call("gunzip -c %s > %s" % (ent.path, path), shell=True)
                        name = name.replace(".gz", "")
                    else:
                        print "Unknown DataPrep"
                else:
                    shutil.copy(ent.path, path)
                docstore.update_from_file(t)
                meta = {}
                meta['name'] = name
                meta['uuid'] = id
                if 'dataPrep' in meta:
                    del meta['dataPrep']
                docstore.put(id, meta)

    dm = {}
    for k,v in data_mapping.items():
        hit = None
        for a in docstore.filter(name=v):
            hit = a[0]
        if hit is None:
            raise Exception("%s not found" % (v))
        dm[k] = { "uuid" : hit }

    workflow = GalaxyWorkflow(ga_file="workflows/Galaxy-Workflow-PCAWG_CGHUB.ga")
    tasks = TaskGroup()
    for ent in synqueue.listAssignments(syn, **config):
        #print "'%s'" % (ent['state']), ent['state'] == 'nan', type(ent['state']), type('nan')
        if not isinstance(ent['state'], basestring) and isnan(ent['state']):
            gnos_endpoint = urlparse(ent['meta']['Normal_WGS_alignment_GNOS_repos']).netloc
            task = GalaxyWorkflowTask("workflow_%s" % (ent['id']),
                workflow,
                inputs=dm,
                parameters={
                    'normal_bam_download' : {
                        "uuid" : ent['meta']['Normal_WGS_alignment_GNOS_analysis_ID'],
                        "gnos_endpoint" : gnos_endpoint,
                        "cred_file" : key_map[gnos_endpoint]
                    },
                    'tumor_bam_download' : {
                        "uuid" : ent['meta']['Tumour_WGS_alignment_GNOS_analysis_IDs'],
                        "gnos_endpoint" : gnos_endpoint,
                        "cred_file" : key_map[gnos_endpoint]
                    },
                    'broad_variant_pipeline' : {
                        "broad_ref_dir" : "/tool_data/files/refdata",
                        "sample_id" : ent['meta']['Submitter_donor_ID']
                    }
                },
                tags=[ "donor:%s" % (ent['meta']['Submitter_donor_ID']) ]
            )
            tasks.append(task)

    if not os.path.exists("%s.tasks" % (args.out_base)):
        os.mkdir("%s.tasks" % (args.out_base))

    for data in tasks:
        with open("%s.tasks/%s" % (args.out_base, data.task_id), "w") as handle:
            handle.write(json.dumps(data.to_dict()))
            state_file = "%s.tasks/%s.state" % (args.out_base, data.task_id)
            if os.path.exists( state_file ):
                os.unlink( state_file )

    print "Tasks Created: %s" % (len(tasks))

    if args.create_service:
        service = GalaxyService(
            docstore=docstore,
            galaxy="bgruening/galaxy-stable",
            sudo=args.sudo,
            tool_data=os.path.abspath("tool_data"),
            tool_dir=os.path.abspath("tools"),
            work_dir=args.work_dir,
            smp=[
                ["MuSE", 8],
                ["pindel", 8],
                ["muTect", 8],
                ["delly", 4],
                ["gatk_bqsr", 12],
                ["gatk_indel", 24],
                ["bwa_mem", 12],
                ["broad_variant_pipline", 24]
            ]
        )
        with open("%s.service" % (args.out_base), "w") as handle:
            s = service.get_config()
            if args.scratch:
                print "Using scratch", args.scratch
                s.set_docstore_config(cache_path=args.scratch, open_perms=True)
            s.store(handle)

Example #13

Show file

File: pcawg_wf_gen.py Project: ICGC-TCGA-PanCancer/pcawg_tools

def run_uploadprep(args):
    """Stage pipeline results in ``args.workdir`` and write upload scripts.

    Steps, in order:

    1. Create ``args.workdir`` if needed and open the document store at
       ``args.out_base``.
    2. Log in to Synapse, load every assignment into a work list keyed by
       assignment id, and fetch the per-donor upload uuids for the
       ``broad``, ``muse`` and ``broad_tar`` result sets.
    3. Scan the document store: for entries in state ``ok`` that carry a
       ``donor:<id>`` tag found on the work list, collect job metrics, copy
       visible VCF outputs into the work directory under a standardized
       name (bgzip-compressing plain VCFs), and remember the location of
       ``broad.tar.gz``.
    4. For every donor that has results in all three result sets, write a
       ``<pipeline>.<donor>.timing.json`` file and a
       ``<pipeline>.<donor>.sh`` uploader script.

    Reads from ``args``: ``workdir``, ``out_base``, ``keyfile``,
    ``upload_url``, ``study``, ``pipeline_src``, ``pipeline_name``,
    ``pipeline_version``, ``vm_instance_type``, ``vm_instance_cores``,
    ``vm_instance_mem_gb`` and ``vm_location_code``.

    Raises:
        Exception: when a visible VCF entry cannot be attributed to a known
            pipeline, variant type or call type from its name.
    """

    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    # pipeline name -> donor -> list of result files for that donor
    file_map = {
        'broad' : {},
        'muse' : {},
        'broad_tar' : {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    # wl_map: assignment id -> assignment metadata (the work list)
    # job_map: donor -> job id -> {tool id: {metric name: raw value}}
    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']

    # One uuid lookup per result set; the orSet callable produces a fresh
    # uuid4 string (presumably used for rows that have no value yet --
    # confirm against synqueue.getValues).
    uuid_map = {}
    uuid_map['broad'] = synqueue.getValues(syn, "Broad_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['muse']  = synqueue.getValues(syn, "Muse_VCF_UUID", orSet=lambda x: str(uuid.uuid4()), **config)
    uuid_map['broad_tar'] = synqueue.getValues(syn, "Broad_TAR_UUID", orSet=lambda x: str(uuid.uuid4()), **config)

    #scan through all of the docs
    for id, entry in doc.filter():
        donor = None
        #look for docs with donor tags (only entries in state 'ok' are considered)
        if 'tags' in entry and 'state' in entry and entry['state'] == 'ok':
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        # the donor tag value is looked up as a work-list key (assignment id)
        if donor is not None and donor in wl_map:
            if donor not in job_map:
                job_map[donor] = {}
            #scan out the job metrics for this job
            if 'job' in entry and 'job_metrics' in entry['job']:
                job_id = entry['job']['id']
                tool_id = entry['job']['tool_id']
                job_info = { tool_id : {} }
                for met in entry['job']['job_metrics']:
                    job_info[tool_id][met['name']] = met['raw_value']
                job_map[donor][job_id] = job_info
            # staged files are named after the tumour analysis id, not the donor id
            donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']
            #look for the vcf output files
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                #fill out the info depending on which caller created the file
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect' ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                # standardized name: <tumour id>.<method>.<YYYYMMDD>.<variant type>.<call type>
                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor_tumor, method, datestr, variant_type, call_type )

                # NOTE(review): the dot in this pattern is unescaped, so it
                # matches any character followed by "vcf" at the end of name.
                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                target = Target(uuid=entry['uuid'])
                # empty documents are not staged
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)

                    shutil.copy(src_file, dst_file)
                    #if the file wasn't compressed already, go ahead and do that
                    if entry['extension'] == 'vcf':
                        subprocess.check_call( "bgzip -c %s > %s.gz" % (dst_file, dst_file), shell=True )
                        dst_file = dst_file + ".gz"

                    #add file to output map (base name only; scripts use relative names)
                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []
                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)
            else:
                # anything that is not a visible VCF: only the broad tarball is
                # recorded, by its path inside the document store
                if entry['name'] == "broad.tar.gz":
                    target = Target(uuid=entry['uuid'])
                    src_file = doc.get_filename(target)
                    file_map['broad_tar'][donor] = [ src_file ]


    # regroup the collected metrics: donor -> tool id -> list of metric dicts
    timing_map = {}
    for donor in job_map:
        timing_map[donor] = {}
        for job_id in job_map[donor]:
            for tool_id in job_map[donor][job_id]:
                if tool_id not in timing_map[donor]:
                    timing_map[donor][tool_id] = []
                timing_map[donor][tool_id].append( job_map[donor][job_id][tool_id] )

    # donor -> number of result sets in file_map that contain the donor
    result_counts = {}
    for pipeline, donors in file_map.items():
        for donor in donors:
            result_counts[donor] = result_counts.get(donor, 0) + 1

    #go through every pipeline
    for pipeline, donors in file_map.items():
        #for that pipeline go through every donor
        for donor, files in donors.items():
            #we're only outputing data for donors on the work list that have
            #results in all three result sets (broad, muse, broad_tar)
            if donor in wl_map and result_counts[donor] == 3:
                #output the timing json
                timing_json = os.path.abspath(os.path.join(args.workdir, "%s.%s.timing.json" %(pipeline, donor)))
                with open( timing_json, "w" ) as handle:
                    handle.write(json.dumps( timing_map[donor] ) )

                #output the uploader script
                with open( os.path.join(args.workdir, "%s.%s.sh" %(pipeline, donor)), "w" ) as handle:
                    # NOTE(review): input_file is not used below, and dst_file
                    # is whatever the scan loop above assigned last.
                    input_file = os.path.basename(dst_file)
                    # metadata URLs for the normal and tumour alignments
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    donor_tumor = wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs']

                    if pipeline in ['broad', 'muse']:
                        # per-VCF preparation: index with tabix, rename the
                        # index to .idx, and write md5 files for both
                        prep_cmd_str = ""
                        for vcf in files:
                            prep_cmd_str += "tabix -p vcf %s\n" % (vcf)
                            prep_cmd_str += "mv %s.tbi %s.idx\n" % (vcf,vcf)
                            prep_cmd_str += "md5sum %s | awk '{print$1}' > %s.md5\n" % (vcf, vcf)
                            prep_cmd_str += "md5sum %s.idx | awk '{print$1}' > %s.idx.md5\n\n" % (vcf, vcf)

                        # uuids of this donor's other result sets
                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])

                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --vcfs %s " % (",".join(files))
                        submit_cmd_str += " --vcf-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in files) )))
                        submit_cmd_str += " --vcf-idxs %s" % ((",".join( ("%s.idx" % i for i in files) )))
                        submit_cmd_str += " --vcf-idx-md5sum-files %s" % ((",".join( ("%s.idx.md5" % i for i in files) )))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --vm-location-code %s" % args.vm_location_code
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    if pipeline in ['broad_tar']:
                        prep_cmd_str = ""
                        new_files = []
                        for tar in files:
                            basename = donor_tumor + ".broad.intermediate"
                            # NOTE(review): assigned rather than appended, so
                            # only the command for the last tar is kept.
                            prep_cmd_str = "%s/remap_broad_tar.py %s %s %s --rename %s %s" % (
                                os.path.dirname(os.path.abspath(__file__)),
                                tar,
                                "./",
                                basename,
                                donor, donor_tumor
                            )
                            new_files.append( basename + ".tar" )

                        # uuids of this donor's other result sets
                        related_uuids = []
                        for p in uuid_map:
                            if p != pipeline:
                                related_uuids.append(uuid_map[p][donor])

                        # NOTE(review): unlike the VCF branch above, no
                        # --vm-location-code option is passed here.
                        submit_cmd_str = "perl -I /opt/gt-download-upload-wrapper/gt-download-upload-wrapper-2.0.11/lib"
                        submit_cmd_str += " /opt/vcf-uploader/vcf-uploader-2.0.5/gnos_upload_vcf.pl"
                        submit_cmd_str += " --metadata-urls %s" % (",".join(urls))
                        submit_cmd_str += " --tarballs %s " % (",".join(new_files))
                        submit_cmd_str += " --tarball-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in new_files) )))
                        submit_cmd_str += " --outdir %s.%s.dir" % (pipeline, donor_tumor)
                        submit_cmd_str += " --key %s " % (args.keyfile)
                        submit_cmd_str += " --upload-url %s" % (args.upload_url)
                        submit_cmd_str += " --study-refname-override %s" % (args.study)
                        submit_cmd_str += " --workflow-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                        submit_cmd_str += " --workflow-name '%s'" % args.pipeline_name
                        submit_cmd_str += " --workflow-version '%s'" % args.pipeline_version
                        submit_cmd_str += " --vm-instance-type '%s'" % args.vm_instance_type
                        submit_cmd_str += " --vm-instance-cores %s" % args.vm_instance_cores
                        submit_cmd_str += " --vm-instance-mem-gb %s" % args.vm_instance_mem_gb
                        submit_cmd_str += " --workflow-file-subset %s" % (pipeline)
                        submit_cmd_str += " --timing-metrics-json %s" % (timing_json)
                        submit_cmd_str += " --related-file-subset-uuids %s" % (",".join(related_uuids))
                        submit_cmd_str += " --uuid %s" % (uuid_map[pipeline][donor])
                        #submit_cmd_str += " --skip-upload"

                    # In string.Template "$$" is an escaped literal "$", so the
                    # generated script contains "$?" and "$0"; the gtupload
                    # section is emitted commented out.
                    handle.write(string.Template("""#!/bin/bash
set -ex
${PREP}
${SUBMIT}
echo $$? > $$0.submitted
#pushd ${SUBMIT_DIR}
#gtupload -v -c ${KEY} -u ./manifest.xml
#ECODE=$$?
#popd
#echo $$ECODE > $$0.uploaded
""").substitute(PREP=prep_cmd_str, SUBMIT=submit_cmd_str,
                            SUBMIT_DIR=os.path.join(os.path.abspath(args.workdir), "vcf", pipeline + "." + donor_tumor + ".dir", uuid_map[pipeline][donor] ),
                            KEY=args.keyfile
                    ) )

Example #14

Show file

File: pcawg_wf_gen.py Project: Jeltje/pcawg_tools

def run_uploadprep(args):
    
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    doc = from_url(args.out_base)
    file_map = {
        'broad' : {},
        'muse' : {}
    }

    syn = synapseclient.Synapse()
    syn.login()

    wl_map = {}
    job_map = {}
    for ent in synqueue.listAssignments(syn, list_all=True, **config):
        wl_map[ent['id']] = ent['meta']
    
    for id, entry in doc.filter():
        donor = None    
        if 'tags' in entry:
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'donor':
                    donor = tmp[1]
        if donor is not None: 
            if donor not in job_map:
                job_map[donor] = {}
            if 'job' in entry and 'job_metrics' in entry['job']:
                print entry['name']
                for met in entry['job']['job_metrics']:
                    if met['name'] == 'runtime_seconds':
                        job_map[donor][entry['name']] = {"tool_id" : entry['job']['tool_id'], "runtime_seconds" : met['raw_value']}
            if entry.get('visible', False) and entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'].split('.')[0] in ['MUSE_1']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect' ]:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (donor, method, datestr, variant_type, call_type )

                name = re.sub(r'.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(args.workdir, file_name)

                    shutil.copy(src_file, dst_file)
                    if entry['extension'] == 'vcf':
                        subprocess.check_call( "bgzip -c %s > %s.gz" % (dst_file, dst_file), shell=True )
                        dst_file = dst_file + ".gz"

                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)

                    if donor not in file_map[pipeline]:
                        file_map[pipeline][donor] = []

                    input_file = os.path.basename(dst_file)
                    file_map[pipeline][donor].append(input_file)

    for pipeline, donors in file_map.items():
        for donor, files in donors.items():
            if donor in wl_map:
                """
                with open( os.path.join(args.workdir, "%s.%s.pipeline.json" %(pipeline, donor)), "w" ) as handle:
                    handle.write(json.dumps( {"pipeline_src" : args.pipeline_src, "pipeline_version" : args.pipeline_version} ))
                """
                timing_json = os.path.join(args.workdir, "%s.%s.timing.json" %(pipeline, donor))
                with open( timing_json, "w" ) as handle:
                    handle.write(json.dumps( job_map[donor] ) )
                    
            
                with open( os.path.join(args.workdir, "%s.%s.sh" %(pipeline, donor)), "w" ) as handle:
                    input_file = os.path.basename(dst_file)
                    urls = [
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Normal_WGS_alignment_GNOS_repos'], wl_map[donor]['Normal_WGS_alignment_GNOS_analysis_ID']),
                        "%scghub/metadata/analysisFull/%s" % (wl_map[donor]['Tumour_WGS_alignment_GNOS_repos'], wl_map[donor]['Tumour_WGS_alignment_GNOS_analysis_IDs'])
                    ]
                    cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                    cmd_str += " --metadata-urls %s" % (",".join(urls))
                    cmd_str += " --vcfs %s " % (",".join(files))
                    cmd_str += " --vcf-md5sum-files %s " % ((",".join( ("%s.md5" % i for i in files) )))
                    cmd_str += " --vcf-idxs %s" % ((",".join( ("%s.idx" % i for i in files) )))
                    cmd_str += " --vcf-idx-md5sum-files %s" % ((",".join( ("%s.idx.md5" % i for i in files) )))
                    cmd_str += " --outdir %s.%s.dir" % (pipeline, donor)
                    cmd_str += " --key %s " % (args.keyfile)
                    cmd_str += " --upload-url %s" % (args.upload_url)
                    cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                    cmd_str += " --workflow-src-url '%s'" % args.pipeline_src
                    cmd_str += " --timing-metrics-json %s" % (timing_json)
                    handle.write("#!/bin/bash\n%s\n" % (cmd_str) )