def create_index_file(bam_filename, bam_dxlink):
    """Create Index file.
    Sorts BAM if needed
    """
    print "Creating Index file."
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print "Sorting BAM"
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = [
            'samtools',
            'sort',
            bam_filename,
            bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print "Indexing BAM"
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    # Upload and return outside the try/except so an unexpected failure is
    # not silently swallowed by a return inside a finally block.
    index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
    aligned_sorted_bam = dxpy.dxlink(dxpy.upload_local_file(sorted_filename))
    return aligned_sorted_bam, index_file_link
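The run_cmd helper and NotIndexedException are assumed by this example and not shown. A minimal sketch of what they might look like (the samtools error text matched here is a version-dependent guess):

import subprocess

class NotIndexedException(Exception):
    """Raised when samtools refuses to index a BAM that is not coordinate-sorted."""

def run_cmd(cmd):
    """Run a command list; translate an 'unsorted BAM' failure from
    samtools index into NotIndexedException."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        # samtools index fails on coordinate-unsorted input; the exact
        # message varies by samtools version.
        if b"not sorted" in err or b"unsorted" in err:
            raise NotIndexedException(err)
        raise subprocess.CalledProcessError(proc.returncode, cmd, output=out)
    return out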
Example #2
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [r.replace('.tar.gz', '') for r in job_inputs['replicate_idr_files']]
    pseudo_replicate_idr_prefixes = [r.replace('.tar.gz', '') for r in job_inputs['pseudo_replicate_idr_files']]
    pooled_pseudo_replicate_idr_prefix = job_inputs['pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep, num_peaks_each_pseudo_rep, numPeaks_Rep0) = get_thresholds(replicate_idr_prefixes,
                                                                                    pseudo_replicate_idr_prefixes,
                                                                                    pooled_pseudo_replicate_idr_prefix,
                                                                                    job_inputs['replicate_peaks_threshold'],
                                                                                    job_inputs['pseudo_replicate_peaks_threshold'],
                                                                                    job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(job_inputs['pooled_replicate_peaks_file'])
    coi = {'signal.value': 7, 'p.value': 8, 'q.value': 9}[job_inputs['ranking_measure']]
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(coi, pooled_replicates_peaks_fn, max_numPeaks_Rep, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(coi, pooled_replicates_peaks_fn, opt_thresh, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file('{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file('{0}_optimal.regionPeak.gz'.format(job_inputs['output_prefix']))

    return {'conservative_peak_calls': dxpy.dxlink(conservative_result),
            'optimal_peak_calls': dxpy.dxlink(optimal_result),
            'num_peaks_each_rep': num_peaks_each_rep,
            'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
            'num_peaks_pooled_pseudo_rep': numPeaks_Rep0}
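download_and_gunzip_file is another helper this example assumes; one plausible minimal implementation (it assumes the platform file carries a .gz name):

import subprocess
import dxpy

def download_and_gunzip_file(dxlink):
    """Download a platform file to the worker and gunzip it in place;
    returns the local, uncompressed filename."""
    dxfile = dxpy.DXFile(dxlink)
    local_gz = dxfile.describe()['name']
    dxpy.download_dxfile(dxfile.get_id(), local_gz)
    subprocess.check_call(['gunzip', '-f', local_gz])
    return local_gz[:-3] if local_gz.endswith('.gz') else local_gz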
Example #3
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of
    alignments produced is correct.
    """

    # Recall that applet_id is set in the associated conftest.py, which either
    # gets it from the command line or builds the applet and retrieves its id.

    # And tmpdir is some pytest magic. Its type is py.path.local.LocalPath.
    # Its strpath property just returns a string.

    applet = dxpy.DXApplet(applet_id)
    input_dict = {"fastq": dxpy.dxlink(SAMPLE_FASTQ),
                  "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)}

    job = applet.run(input_dict, instance_type="mem1_ssd1_x16",
                         folder=folder, project=project_id)

    job.wait_on_done()
    
    output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"])
    local_filename = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(output_bam_dxfile.get_id(),
                         local_filename)
    count_alignments_cmd = "samtools view {bam} | wc -l".format(
        bam=local_filename)
    num_alignments = int(subprocess.check_output(count_alignments_cmd,
                                                 shell=True))
    assert num_alignments == 1951476
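The conftest.py mentioned above is not shown; a hypothetical sketch of the fixture it would provide (the applet source path "bwa_applet" is an assumption):

import json
import subprocess
import pytest

def pytest_addoption(parser):
    parser.addoption("--applet-id", action="store", default=None)

@pytest.fixture(scope="session")
def applet_id(pytestconfig):
    explicit = pytestconfig.getoption("--applet-id")
    if explicit:
        return explicit
    # `dx build` prints a JSON object such as {"id": "applet-xxxx"}.
    out = subprocess.check_output(["dx", "build", "bwa_applet"])
    return json.loads(out)["id"]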
Example #4
def run_test_analyses(project, folder, workflow, find_test_data):
    # test cases: one or more named input hashes to run the workflow with
    test_inputs = {
        "21+Y": {
            "construct.reference_genome": dxpy.dxlink(find_test_data("hs37d5.fa.gz").get_id()),
            "construct.reference_variants": dxpy.dxlink(find_test_data("ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz").get_id()),
            "construct.reference_contigs": ["21", "Y"],
            "map.reads": dxpy.dxlink(find_test_data("HS1011_unitigs_Y.fastq.gz").get_id())
        }
    }

    # The tests might only need smaller instance types than the applet
    # defaults (reduces cost of running tests).
    stage_instance_types = {
        "construct": "mem3_ssd1_x8",
        "index": "mem3_ssd1_x8",
        "map": "mem3_ssd1_x8"
    }

    git_revision = workflow.describe(incl_properties=True)["properties"]["git_revision"]
    analyses = []
    for test_name, test_input in test_inputs.iteritems():
        test_folder = os.path.join(folder, test_name)
        project.new_folder(test_folder, parents=True)
        analyses.append(workflow.run(test_input, project=project.get_id(), folder=test_folder,
                                     stage_instance_types=stage_instance_types,
                                     delay_workspace_destruction=True,
                                     name="dxvg {} {}".format(test_name, git_revision)))
    return analyses
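A typical follow-up, since workflow.run returns DXAnalysis handlers, is to block until every test analysis finishes:

for analysis in run_test_analyses(project, folder, workflow, find_test_data):
    analysis.wait_on_done()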
Example #5
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink, dme_ix_dxlink, target_root):
    '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8'''

    print "* coverage(): Retrieve context files and index..."
    CpG_context = 'output/CpG_context_%s.txt' % target_root
    CHG_context = 'output/CHG_context_%s.txt' % target_root
    CHH_context = 'output/CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* coverage(): Uncompress index..."
    run_cmd('tar -zxf ' + dme_ix)

    (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context, CHG_context, CHH_context)
    
    print "* coverage(): Storing coverage results..."
    cx_report_dxfile = dxpy.upload_local_file(cx_report)
    bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz)

    print "* coverage(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "cx_report_dxlink":     dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink":   dxpy.dxlink(bedgraph_gz_dxfile)
    }
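bismark_coverage wraps the two Bismark tools named in the docstring; an unverified sketch, assuming the index unpacks into input/ and relying on the module's run_cmd helper (flag names follow Bismark's documented CLI but are not confirmed by this example):

def bismark_coverage(target_root, cpg, chg, chh):
    """Hypothetical helper: bismark2bedGraph over the three context files,
    then coverage2cytosine on the resulting coverage file."""
    run_cmd('bismark2bedGraph --CX -o %s.bedGraph --dir output %s %s %s' %
            (target_root, cpg, chg, chh))
    run_cmd('coverage2cytosine --CX -o %s --genome_folder input --dir output '
            'output/%s.bismark.cov.gz' % (target_root, target_root))
    return ('output/%s.bedGraph.gz' % target_root,
            'output/%s.CX_report.txt' % target_root)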
Example #6
    def test_paired_with_contam(self):
        bed_file = dxpy.find_one_data_object(
            name="hg19_GRCh37_Feb2009_RefSeq.bed")['id']
        mappings = dxpy.find_one_data_object(
            name="SRR018256_paired_RNA_Mappings",
            typename="LetterMappings")['id']
        contam_contig = dxpy.find_one_data_object(name="human rRNA",
                                                  typename="ContigSet")['id']
        reads = dxpy.find_one_data_object(name="SRR018256_reads",
                                          typename="LetterReads")['id']
        if bed_file == None:
            print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed.  Please upload it"
            return False
        if mappings == None:
            print "Cannot find Mappings.  Please upload them"
            return False
        if contam_contig == None:
            print "Cannot find human rRNA.  Please upload it"
            return False
        if reads == None:
            print "Cannot find SRR018256_reads.  Please upload it"
            return False

        input = {
            'rna_seq_mappings': dxpy.dxlink(mappings),
            'bed_file': dxpy.dxlink(bed_file),
            'contaminants': [dxpy.dxlink(contam_contig)],
            'original_reads': [dxpy.dxlink(reads)]
        }

        print "Running program with", input
        job = self.program.run(input)
        print "launched test_paired_with_contam ", job.get_id()
Example #7
    def test_paired_with_contam(self):
        bed_file = dxpy.find_one_data_object(name="hg19_GRCh37_Feb2009_RefSeq.bed")['id']
        mappings = dxpy.find_one_data_object(name="SRR018256_paired_RNA_Mappings", typename="LetterMappings")['id']
        contam_contig = dxpy.find_one_data_object(name="human rRNA", typename="ContigSet")['id']
        reads = dxpy.find_one_data_object(name="SRR018256_reads", typename="LetterReads")['id']
        if bed_file == None:
            print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed.  Please upload it"
            return False
        if mappings == None:
            print "Cannot find Mappings.  Please upload them"
            return False
        if contam_contig == None:
            print "Cannot find human rRNA.  Please upload it"
            return False
        if reads == None:
            print "Cannot find SRR018256_reads.  Please upload it"
            return False

        input = { 'rna_seq_mappings': dxpy.dxlink(mappings), 
                  'bed_file': dxpy.dxlink(bed_file),
                  'contaminants': [dxpy.dxlink(contam_contig)],
                  'original_reads': [dxpy.dxlink(reads)] }

        print "Running program with", input
        job = self.program.run(input)
        print "launched test_paired_with_contam ", job.get_id()
Example #8
    def _format_data_file(self, df: DataFile) -> dict:
        if isinstance(df.localizer, UrlLocalizer):
            ul = cast(UrlLocalizer, df.localizer)
            if ul.url.startswith("dx://"):
                return dxpy.dxlink(*ul.url[5:].split(":"))

        file_name = df.local_path.name

        existing_files = list(dxpy.find_data_objects(
            classname="file",
            state="closed",
            name=file_name,
            project=self._project_id,
            folder=self._folder,
            recurse=False
        ))

        if not existing_files:
            # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
            #  parallelization
            return dxpy.dxlink(dxpy.upload_local_file(
                str(df.path),
                name=file_name,
                project=self._project_id,
                folder=self._folder,
                parents=True,
                wait_on_close=True
            ))
        elif len(existing_files) == 1:
            return dxpy.dxlink(existing_files[0]["id"], self._project_id)
        else:
            raise RuntimeError(
                f"Multiple files with name {file_name} found in "
                f"{self._project_id}:{self._folder}"
            )
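For reference, dxpy.dxlink produces one of two link shapes, both of which appear throughout these examples:

import dxpy

dxpy.dxlink("file-xxxx")
# -> {"$dnanexus_link": "file-xxxx"}
dxpy.dxlink("file-xxxx", "project-yyyy")
# -> {"$dnanexus_link": {"project": "project-yyyy", "id": "file-xxxx"}}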
Example #9
def main(input_bam, paired=True, params=''):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'], [".bam", ".BAM", ".sam", ".SAM"])


    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.

    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name

    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
Example #10
def main(input_bam, paired=True, params=''):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.

    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name

    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
Example #11
def _type_convert_primitive(val, klass):
    retval = None
    ref_files = []
    if klass == 'string':
        retval = val
    elif klass == 'int':
        retval = int(val)
    elif klass == "boolean":
        retval = bool(val)
    elif klass == 'float':
        retval = float(val)
    elif klass == 'hash':
        retval = json.loads(val)
    elif klass == 'file':
        if val.startswith("project-"):
            val = val.split(":")
            retval = dxpy.dxlink(object_id=val[1], project_id=val[0])
            ref_files.append(retval)
        elif val.startswith("file-"):
            retval = dxpy.dxlink(val)
            ref_files.append(retval)
        else:
            raise Exception(
                "Malformed file {}, must start with 'file-' or 'project-'".
                format(val))
    else:
        raise Exception("class {} not currently supported".format(klass))
    return retval, ref_files
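Hypothetical usage, showing the (value, referenced-files) pair the converter returns:

val, refs = _type_convert_primitive("41", "int")
# -> (41, [])
val, refs = _type_convert_primitive("project-aaaa:file-bbbb", "file")
# -> ({"$dnanexus_link": {"project": "project-aaaa", "id": "file-bbbb"}},
#     [{"$dnanexus_link": {"project": "project-aaaa", "id": "file-bbbb"}}])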
Example #12
    def setUpClass(cls):
        if RUN_JOB_ON_DX:
            if not project_name:
                print "'PROJ_NAME' environment variable must be defined!"
                sys.exit(1)
            working_project_id = dxpy.find_one_project(more_ok=False,
                                                       name=project_name)["id"]
            run_args = {}
            run_args["project"] = working_project_id
            run_args["name"] = "vcfscope-measure on chr21"
            run_args["folder"] = "/purge/" + app_name
            input_hash = {}
            input_hash["vcfgz"] = dxpy.dxlink("file-BkkjFkj098Gb2jZ1Yx533JFv",
                                              project_id)
            input_hash["bam"] = dxpy.dxlink("file-Bkkjj5Q098Gkvkb3Xx5Pxj1J",
                                            project_id)
            input_hash["bai"] = dxpy.dxlink("file-Bkkjj5Q098GzYx2bG5YJ3z34",
                                            project_id)
            input_hash["region"] = dxpy.dxlink("file-Bkkj22Q098Gz5yK1Q955G5gX",
                                               project_id)

            app = dxpy.DXApp(name=app_name, alias="9.9.7")
            cls.job = app.run(input_hash, **run_args)

        else:
            job_id = "job-F1JpY9Q0pVj0BgpYBp14f31Q"
            cls.job = dxpy.DXJob(job_id)

        cls.job.wait_on_done()
Example #13
    def setUp(self):
        setUpTempProjects(self)
        self.dxapplet = dxpy.DXApplet()
        self.dxapplet.new(name="identity-record",
                          dxapi="1.04",
                          inputSpec=[{"name": "record", "class": "record"}
                                     ],
                          outputSpec=[{"name": "record", "class": "record"}],
                          runSpec={"code": '''
@dxpy.entry_point('main')
def main(record):
    return {'record': record}''',
                                   "interpreter": "python2.7"})
        dxrecord = dxpy.new_dxrecord(name='workflowname',
                                     details={"stages": [{"job": None,
                                                          "inputs": {},
                                                          "app": dxpy.dxlink(self.dxapplet),
                                                          "id": "stage0-id"
                                                          },
                                                         {"job": None,
                                                          "inputs": {"record": {"connectedTo": {"output": "record",
                                                                                                "stage": "stage0-id"}
                                                                                }
                                                                     },
                                                          "app": dxpy.dxlink(self.dxapplet),
                                                          "id": "stage1-id"
                                                          }],
                                              "version": 5},
                                     types=['pipeline'])
        self.workflow = dxpy.DXWorkflow(dxrecord.get_id())
        self.closedrecord = dxpy.new_dxrecord(name='a record')
        self.closedrecord.close()
Example #14
def makeInputsBwa():
    try:
        contigset_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer not found, please upload them")

    genome_archive = dxpy.upload_local_file(os.path.join(test_resources_dir, "hg19_chr22.fa.xz"), wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22", "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]

    left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    #left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    #right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)

    reads_importer_input = {"left_file": dxpy.dxlink(left_reads), "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]

    return {"reads": [reads] * 3, "reference": contig_set}
Example #15
def make_indexed_reference( ref_ID ):

    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz", hidden=True, wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(name=ref_name + " (indexed for Bowtie2)",
                                           types=["BowtieLetterContigSetV2"],
                                           details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                                                    'original_contigset': dxpy.dxlink(ref_ID)})
    indexed_ref_record.close()

    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
Example #16
def make_indexed_reference(ref_ID):

    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz",
                                                hidden=True,
                                                wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(
        name=ref_name + " (indexed for Bowtie2)",
        types=["BowtieLetterContigSetV2"],
        details={
            'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
            'original_contigset': dxpy.dxlink(ref_ID)
        })
    indexed_ref_record.close()
    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
Example #17
    def build(incl_map):
        nm = "vg_construct_index_map" if incl_map else "vg_construct_index"
        wf = dxpy.new_dxworkflow(title=nm,
                                 name=nm,
                                 description=nm,
                                 project=project.get_id(),
                                 folder=folder,
                                 properties={"git_revision": git_revision})

        construct_applet = find_applet("vg_construct")
        construct_input = {
        }
        construct_stage_id = wf.add_stage(construct_applet, stage_input=construct_input, name="construct")

        index_input = {
            "vg_tar": dxpy.dxlink({"stage": construct_stage_id, "outputField": "vg_tar"})
        }
        index_stage_id = wf.add_stage(find_applet("vg_index"), stage_input=index_input, name="index")

        if incl_map:
            map_input = {
                "vg_indexed_tar": dxpy.dxlink({"stage": index_stage_id, "outputField": "vg_indexed_tar"})
            }
            map_stage_id = wf.add_stage(find_applet("vg_map"), stage_input=map_input, name="map")

        return wf
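find_applet is assumed here; a minimal sketch, reusing the enclosing project and folder variables from the surrounding scope:

def find_applet(name):
    """Hypothetical helper: resolve an applet by name within the build folder."""
    return dxpy.find_one_data_object(classname="applet", name=name,
                                     project=project.get_id(), folder=folder,
                                     zero_ok=False, more_ok=False,
                                     return_handler=True)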
Example #18
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of
    alignments produced is correct.
    """

    # Recall that applet_id is set in the associated conftest.py, which either
    # gets it from the command line or builds the applet and retrieves its id.

    # And tmpdir is some pytest magic. Its type is py.path.local.LocalPath.
    # Its strpath property just returns a string.

    applet = dxpy.DXApplet(applet_id)
    input_dict = {
        "fastq": dxpy.dxlink(SAMPLE_FASTQ),
        "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)
    }

    job = applet.run(input_dict,
                     instance_type="mem1_ssd1_x16",
                     folder=folder,
                     project=project_id)

    job.wait_on_done()

    output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"])
    local_filename = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(output_bam_dxfile.get_id(), local_filename)
    count_alignments_cmd = "samtools view {bam} | wc -l".format(
        bam=local_filename)
    num_alignments = int(
        subprocess.check_output(count_alignments_cmd, shell=True))
    assert num_alignments == 1951476
Example #19
def create_index_file(bam_filename, bam_dxlink):
    """Create Index file.
    Sorts BAM if needed
    """
    print("Creating Index file.")
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print("Sorting BAM")
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = [
            'samtools',
            'sort',
            bam_filename,
            bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print("Indexing BAM")
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    # Upload and return outside the try/except so an unexpected failure is
    # not silently swallowed by a return inside a finally block.
    index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
    aligned_sorted_bam = dxpy.dxlink(dxpy.upload_local_file(sorted_filename))
    return aligned_sorted_bam, index_file_link
Example #20
def postprocess(bam_files,
                report_files,
                bam_root,
                nthreads=8,
                use_cat=False,
                use_sort=False):
    # This is the "gather" phase which aggregates and performs any
    # additional computation after the "map" (and therefore after all
    # the "process") jobs are done.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("** In Postprocess - refactored dme-merge-bams - *")

    versions = "Unknown"
    if os.path.isfile(VERSION_SCRIPT):
        try:
            versions = subprocess.check_output(
                shlex.split(
                    'tool_versions.py --dxjson dnanexus-executable.json'))
        except:
            pass

    merged_bam = merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads)

    (merged_report, report_file_names) = merge_reports(bam_root, report_files,
                                                       bam_root)

    (merged_qc, nreads, metadata) = merge_qc(bam_root, report_file_names)

    props = {
        'SW': versions,
        'reads': nreads,
    }
    output = {
        "bam_techrep":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_bam,
                                   details=metadata,
                                   properties=props)),
        "bam_techrep_qc":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_qc,
                                   details=metadata,
                                   properties={'SW': versions})),
        "map_techrep":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_report,
                                   details=metadata,
                                   properties={'SW': versions})),
        "reads":
        nreads,
        "metadata":
        json.dumps(metadata)
    }
    return output
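merge_bams, merge_reports, and merge_qc are assumed helpers. A rough sketch of merge_bams alone, under the assumption that the flags select between samtools cat, merge, and a final sort:

import subprocess
import dxpy

def merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads):
    """Hypothetical sketch: download the technical-replicate BAMs and
    combine them into <bam_root>_techrep.bam."""
    local = []
    for i, link in enumerate(bam_files):
        fn = 'in_%d.bam' % i
        dxpy.download_dxfile(link, fn)
        local.append(fn)
    merged = bam_root + '_techrep.bam'
    if use_cat:
        subprocess.check_call(['samtools', 'cat', '-o', merged] + local)
    else:
        subprocess.check_call(['samtools', 'merge', '-@', str(nthreads), merged] + local)
    if use_sort:
        srt = bam_root + '_techrep_sorted.bam'
        subprocess.check_call(['samtools', 'sort', '-@', str(nthreads), '-o', srt, merged])
        merged = srt
    return merged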
Example #21
def main(input_bam, paired_end):

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # Note: rstrip('.bam') strips any trailing run of '.', 'b', 'a', 'm'
    # characters, not the '.bam' suffix, so slice off the extension instead.
    input_bam_basename = input_bam_filename[:-4] if input_bam_filename.endswith('.bam') else input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
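common.run_pipe is an assumed ENCODE helper; a simplified sketch that joins the fragments into one shell pipeline (the original presumably chains Popen objects, but the effect is similar):

import subprocess

def run_pipe(steps, outfile=None):
    """Hypothetical helper: run shell fragments piped together, optionally
    redirecting the final stdout to a file."""
    cmd = ' | '.join(steps)
    if outfile:
        cmd += ' > ' + outfile
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd, output=out)
    return out, err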
Example #22
def main(input_SAM, deviations=None, histogram_width=None, min_percent=None, metric_acc_level=None, ref=None, is_sorted=None, stop_after=None):

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_SAM, "input")
    if ref != None:
        dxpy.download_dxfile(ref, "ref.fa")


    command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar"
    command += " INPUT=input"
    command += " OUTPUT=insert_distribution.txt"
    command += " HISTOGRAM_FILE=histogram.pdf"
    if deviations != None:
        command += " DEVIATIONS="+str(deviations)
    if histogram_width != None:
        command += " HISTOGRAM_WIDTH="+str(histogram_width)
    if min_percent != None:
        command += " MINIMUM_PCT="+str(min_percent)
    if metric_acc_level != None:
        for level in metric_acc_level:
            command += " METRIC_ACCUMULATION_LEVEL="+str(level)
    if ref != None:
        command += " REFERENCE_SEQUENCE=ref.fa"
    if is_sorted != None:
        if is_sorted:
            command += " ASSUME_SORTED=true"
        else:
            command += " ASSUME_SORTED=false"
    if stop_after != None:
        command += " STOP_AFTER="+str(stop_after)

    print "Executing:"
    print command

    # CALL the command here:
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    histogram = dxpy.upload_local_file("histogram.pdf")
    histogram.rename(dxpy.DXFile(input_SAM).describe()['name']+"_histogram.pdf")
    output_dist = dxpy.upload_local_file("insert_distribution.txt")
    output_dist.rename(dxpy.DXFile(input_SAM).describe()['name']+"_insert_dist.txt")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["histogram"] = dxpy.dxlink(histogram)
    output["output"] = dxpy.dxlink(output_dist)

    return output
Example #23
def main(**kwargs):

    dxpy.download_folder(DCC_CREDENTIALS_PROJECT,
                         '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" %
                       ("." * len(authid), "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})

    return output
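common.processkey is assumed; a plausible sketch that reads an ENCODE-style keypairs JSON file of the form {"<key>": {"server": ..., "key": ..., "secret": ...}}:

import json

def processkey(key, keyfile):
    """Hypothetical helper: return (authid, authpw, server), or None if
    the key is not present in the keyfile."""
    try:
        with open(keyfile) as fh:
            entry = json.load(fh)[key]
    except (IOError, KeyError, ValueError):
        return None
    return entry['key'], entry['secret'], entry['server']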
Example #24
def get_dxfile(filePath, project=None):
    '''Returns dxfile object.'''
    dxfile = None
    #if filePath.find("$dnanexus_link") != -1:
    #    filePath = filePath.split(' ')[1]
    #    filePath = filePath.replace("'","").replace('"','').replace("}","").replace("{","")
    try:
        dxlink = json.loads(filePath.strip("'"))
    except:
        dxlink = None

    if project != None:

        try:
            if dxlink != None:
                dxfile = dxpy.get_handler(dxlink, project=project)
            else:
                dxfile = dxpy.get_handler(filePath, project=project)
        except:
            try:
                dxlink = dxpy.dxlink(filePath, project=project)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath, project=proj_id)
                except:
                    sys.stderr.write('ERROR: unable to find file "' +
                                     filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script

    else:

        try:
            if dxlink != None:
                dxfile = dxpy.get_handler(dxlink)
            else:
                dxfile = dxpy.get_handler(filePath)
        except:
            try:
                dxlink = dxpy.dxlink(filePath)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath, project=proj_id)
                except:
                    sys.stderr.write('ERROR: unable to find file "' +
                                     filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script

    if dxfile == None:
        sys.stderr.write('ERROR: unable to find file "' + filePath + '": \n')
        sys.exit(0)  # Do not error on tool run in dx script

    return dxfile
Example #25
def pooled(files):
    pool_applet = dxpy.find_one_data_object(
        classname='applet', name='pool', project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False, more_ok=False, return_handler=True)
    logger.debug('input files:%s' %(files))
    logger.debug('input file ids:%s' %([dxf.get_id() for dxf in files]))
    logger.debug('input files dxlinks:%s' %([dxpy.dxlink(dxf) for dxf in files]))
    pool_subjob = pool_applet.run({"inputs": [dxpy.dxlink(dxf) for dxf in files]})
    pooled_file = pool_subjob.get_output_ref("pooled")
    return pooled_file
Example #26
def get_dxfile(filePath,project=None):
    '''Returns dxfile object.'''
    dxfile = None
    #if filePath.find("$dnanexus_link") != -1:
    #    filePath = filePath.split(' ')[1]
    #    filePath = filePath.replace("'","").replace('"','').replace("}","").replace("{","")
    try:
        dxlink = json.loads(filePath.strip("'"))
    except:
        dxlink = None
        
    if project != None:
        
        try:
            if dxlink != None:
                dxfile = dxpy.get_handler(dxlink,project=project)
            else:
                dxfile = dxpy.get_handler(filePath,project=project)
        except:
            try:
                dxlink = dxpy.dxlink(filePath,project=project)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath,project=proj_id)
                except:
                    sys.stderr.write('WARNING: unable to find file "' + filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script 
    
    else:
    
        try:
            if dxlink != None:
                dxfile = dxpy.get_handler(dxlink)
            else:
                dxfile = dxpy.get_handler(filePath)
        except:
            try:
                dxlink = dxpy.dxlink(filePath)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath,project=proj_id)
                except:
                    sys.stderr.write('WARNING: unable to find file "' + filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script 

    if dxfile == None:
        sys.stderr.write('WARNING: unable to find file "' + filePath + '": \n')
        sys.exit(0)  # Do not error on tool run in dx script 
    
    return dxfile
Example #27
def main(cons1, cons2, outroot, xchr=True, recalnums=1, skip=20, timemax=7500000.0):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    cons1 = dxpy.DXFile(cons1)
    cons2 = dxpy.DXFile(cons2)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(cons1.get_id(), "cons1")
    dxpy.download_dxfile(cons2.get_id(), "cons2")
    outname1 = outroot + '.psmcfa'
    outname2 = outroot + '.psmc'

    # Fill in your application code here.
    #create the psmcfa file
    createPSMCfa('cons1', 'cons2', outname1, skip)
    print 'Generated the PSMC fasta file.'
    sys.stdout.flush()
    #run psmc the first time
    subprocess.check_call(['psmc', '-t', '15', '-r', '5', '-p', "4+25*2+4+6", '-o', 'test.psmc', outname1])
    print 'Done with first run of PSMC.'
    sys.stdout.flush()
    #run the recal script and run psmc again.
    while (recalnums > 1):
        (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
        subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', 'test.psmc', outname1])
        recalnums -= 1
        print 'Recals left', recalnums
        sys.stdout.flush()
    (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', outname2, outname1])
    print 'Finished final recalibration run.'

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    outfile1 = dxpy.upload_local_file(outname1)
    outfile2 = dxpy.upload_local_file(outname2)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["outfile1"] = dxpy.dxlink(outfile1)
    output["outfile2"] = dxpy.dxlink(outfile2)

    return output
Example #28
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [
        r.replace('.tar.gz', '') for r in job_inputs['replicate_idr_files']
    ]
    pseudo_replicate_idr_prefixes = [
        r.replace('.tar.gz', '')
        for r in job_inputs['pseudo_replicate_idr_files']
    ]
    pooled_pseudo_replicate_idr_prefix = job_inputs[
        'pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep, num_peaks_each_pseudo_rep,
     numPeaks_Rep0) = get_thresholds(
         replicate_idr_prefixes, pseudo_replicate_idr_prefixes,
         pooled_pseudo_replicate_idr_prefix,
         job_inputs['replicate_peaks_threshold'],
         job_inputs['pseudo_replicate_peaks_threshold'],
         job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(
        job_inputs['pooled_replicate_peaks_file'])
    coi = {
        'signal.value': 7,
        'p.value': 8,
        'q.value': 9
    }[job_inputs['ranking_measure']]
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, max_numPeaks_Rep,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, opt_thresh,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file(
        '{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file('{0}_optimal.regionPeak.gz'.format(
        job_inputs['output_prefix']))

    return {
        'conservative_peak_calls': dxpy.dxlink(conservative_result),
        'optimal_peak_calls': dxpy.dxlink(optimal_result),
        'num_peaks_each_rep': num_peaks_each_rep,
        'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
        'num_peaks_pooled_pseudo_rep': numPeaks_Rep0
    }
Example #29
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name,
                                                    str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se  ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance.  Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").

    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root + '_bismark.bam', bam_root + '.bam')
    return {
        "bam_file":
        dxpy.dxlink(dxpy.upload_local_file(bam_root + '.bam')),
        "report_file":
        dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
Example #30
def run_wg_build(project, folder, workflow, find_test_data, depends_on):
    wg_input = {
        "construct.reference_genome": dxpy.dxlink(find_test_data("hs37d5.fa.gz").get_id()),
        "construct.reference_variants": dxpy.dxlink(find_test_data("ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz").get_id()),
        "map.reads": dxpy.dxlink(find_test_data("HS1011.mag.gz").get_id())
    }

    git_revision = workflow.describe(incl_properties=True)["properties"]["git_revision"]
    test_folder = os.path.join(folder, "whole-genome")
    project.new_folder(test_folder, parents=True)
    return workflow.run(wg_input, project=project.get_id(), folder=test_folder,
                        depends_on=depends_on, priority="normal",
                        name="dxvg whole-genome {}".format(git_revision))
Example #31
def main(reads1, bwa_aln_params, bwa_version, samtools_version, reads2, reference_tar, key, debug):

	if debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)


	#for each input fastq decide if it's specified as an ENCODE file accession number (ENCFF*)


	reads1_files = [resolve_file(read, key) for read in reads1]
	if len(reads1_files) > 1:
		pool_applet = dxpy.find_one_data_object(
			classname='applet', name='pool', project=dxpy.PROJECT_CONTEXT_ID,
			zero_ok=False, more_ok=False, return_handler=True)
		logger.debug('reads1_files:%s' %(reads1_files))
		logger.debug('reads1_files ids:%s' %([dxf.get_id() for dxf in reads1_files]))
		logger.debug('reads1_files dxlinks:%s' %([dxpy.dxlink(dxf) for dxf in reads1_files]))
		pool_subjob = pool_applet.run({"inputs": [dxpy.dxlink(dxf) for dxf in reads1_files]})
		reads1_file = pool_subjob.get_output_ref("pooled")
	else:
		reads1_file = reads1_files[0]
	reads2_file = resolve_file(reads2, key)
	reference_tar_file = resolve_file(reference_tar, key)

	logger.info('Resolved reads1 to %s', reads1_file)
	if reads2:
		logger.info('Resolved reads2 to %s', reads2_file)
	logger.info('Resolved reference_tar to %s', reference_tar_file)

	output = {}
	output.update({'reads1': reads1_file})
	if reads2:
		output.update({"reads2": reads2_file})
	output_json = {
		"reads1": reads1_file,
		"reference_tar": reference_tar_file,
		"bwa_aln_params": bwa_aln_params,
		"bwa_version": bwa_version,
		"samtools_version": samtools_version
	}
	if reads2:
		output_json.update({'reads2': reads2_file})
	output.update({'output_JSON': output_json})
	#logger.info('Exiting with output_JSON: %s' %(json.dumps(output)))
	#return {'output_JSON': json.dumps(output)}

	logger.info('Exiting with output: %s' %(output))
	return output
Example #32
def copy_files(fids, project_id, folder):
    new_fids = []
    for file_dict in fids:
        f = dxpy.DXFile(dxid=file_dict['id'], project=file_dict['project'])
        fn = f.describe()['name']

        # Check to see if file already exists.
        found_file = dxpy.find_one_data_object(classname='file', project=project_id, folder=folder, zero_ok=True, name=fn)
        if found_file is None:
            new_fids += [dxpy.dxlink(f.clone(project_id, folder))]
        else:
            new_fids += [dxpy.dxlink(found_file)]

    return new_fids
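Hypothetical usage, mirroring two files into an /inputs folder of another project (all ids are placeholders):

fids = [{'id': 'file-aaaa', 'project': 'project-xxxx'},
        {'id': 'file-bbbb', 'project': 'project-xxxx'}]
links = copy_files(fids, 'project-yyyy', '/inputs')
# links is a list of $dnanexus_link dicts, one per input file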
Example #33
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''

    (target_root,biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map,all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports)
    
    print "* merge_extract(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_simple_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* Retrieve split report..."
    append_line("\n===== bismark_methylation_extractor: splitting_report =====",biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root,out=biorep_bam_qc,append=True,silent=True)

    # TODO: Is this even needed?  Currently we do it just to get the size!
    #if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #    print "* merge_extract(): Storing biorep bam..."
    #    props_ex = props.copy()
    #    props_ex.update({ 'reads': str(reads) })
    #    biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True))
    #else:
    #    biorep_bam_dxlink = bam_set[0]

    print "* merge_extract(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,properties=props,details=qc_metrics)
    biorep_map_dxfile    = dxpy.upload_local_file(biorep_map,   properties=props,details=qc_metrics)
    split_report_dxfile  = dxpy.upload_local_file(target_root+'_splitting_report.txt')
    chrom_sizes_dxfile   = dxpy.upload_local_file('input/chrom.sizes')
    mbias_report_dxfile  = dxpy.upload_local_file(target_root+'_mbias_report.txt',properties=props,details=qc_metrics)
    CpG_context_dxfile   = dxpy.upload_local_file('output/CpG_context_%s.txt' % (target_root))
    CHG_context_dxfile   = dxpy.upload_local_file('output/CHG_context_%s.txt' % (target_root))
    CHH_context_dxfile   = dxpy.upload_local_file('output/CHH_context_%s.txt' % (target_root))

    print "* merge_extract(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        #"biorep_bam_dxlink":    biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink":    dxpy.dxlink(biorep_map_dxfile),
        "CpG_context_dxlink":   dxpy.dxlink(CpG_context_dxfile),
        "CHG_context_dxlink":   dxpy.dxlink(CHG_context_dxfile),
        "CHH_context_dxlink":   dxpy.dxlink(CHH_context_dxfile),
        "split_report_dxlink":  dxpy.dxlink(split_report_dxfile),
        "chrom_sizes_dxlink":   dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink":  dxpy.dxlink(mbias_report_dxfile),
        "target_root":          target_root,
        "qc_metrics":           qc_metrics
    }
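The string-flavored run_cmd used here (with out/append/silent keywords) is assumed; a minimal sketch:

import shlex
import subprocess

def run_cmd(cmd, out=None, append=False, silent=False):
    """Hypothetical helper: run a shell command given as a string,
    optionally redirecting stdout to a file."""
    if not silent:
        print("* Running: %s" % cmd)
    if out is not None:
        with open(out, 'a' if append else 'w') as fh:
            subprocess.check_call(shlex.split(cmd), stdout=fh)
    else:
        subprocess.check_call(shlex.split(cmd))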
Example #34
def find_file(filePath,project=None,verbose=False,multiple=False, recurse=True):
    '''Using a DX style file path, find the file.'''
    proj = project
    path = filePath
    fileName = filePath
    if filePath.find(':') != -1:
        proj, path = filePath.split(':', 1)
    if path.rfind('/') != -1:
        path, fileName = path.rsplit('/', 1)
    else:
        fileName = path
        path = '/'
    if proj == None:
        if verbose:
            print "ERROR: Don't know what project to use for '" + path + "'."
        return None
    if proj.find('project-') == 0:
        projId = proj
    else:
        projId = get_project(proj, level='VIEW').get_id()
    mode = 'exact'
    # str.find() returns -1 (truthy) when the character is absent, so the
    # wildcard check must compare explicitly.
    if filePath.find('*') != -1 or filePath.find('?') != -1:
        mode = 'glob'
    fileDicts = list(dxpy.find_data_objects(classname='file', folder=path, name=fileName, recurse=recurse,
                                            name_mode=mode, project=projId, return_handler=False))

    if fileDicts == None or len(fileDicts) == 0:
        #print "- Found 0 files from '" + proj + ":" + filePath + "'."
        if verbose:
            print "ERROR: Failed to find '" + proj + ":" + filePath + "'."
        return None
    elif len(fileDicts) > 1 or multiple:
        #print "- Found "+str(len(fileDict))+" files from '" + proj + ":" + filePath + "'."
        if not multiple:
            if verbose:
                print "ERROR: Found "+str(len(fileDicts))+" files when expecting 1 '" + proj + ":" + filePath + "'."
            return None
        else:
            if verbose:
                print " Found "+str(len(fileDicts))+" files for '" + proj + ":" + filePath + "'."
        fids = []
        for fileDict in fileDicts:
            FILES[fileDict['id']] = dxpy.dxlink(fileDict)
            fids.append( fileDict['id'] )
        return fids
    else:
        #print "- FOUND '" + proj + ":" + filePath + "'."
        FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0])
        return fileDicts[0]['id']
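Hypothetical usage (project and path names are placeholders; the wildcard in the second call switches name_mode to 'glob'):

fid = find_file('myproject:/fastqs/sample1_R1.fastq.gz')
fids = find_file('myproject:/fastqs/*.fastq.gz', multiple=True)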
Example #35
def pooled(files):
    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    logger.debug('input files:%s' % (files))
    logger.debug('input file ids:%s' % ([dxf.get_id() for dxf in files]))
    logger.debug('input files dxlinks:%s' %
                 ([dxpy.dxlink(dxf) for dxf in files]))
    pool_subjob = pool_applet.run(
        {"inputs": [dxpy.dxlink(dxf) for dxf in files]})
    pooled_file = pool_subjob.get_output_ref("pooled")
    return pooled_file
Example #36
def run_bwa_mem(sample, fastq_dict, mapper_app_dxid, ref_genome_index,
                project_id):
    '''
    Description: Maps sample fastq files to a reference genome
    Args:
        sample (str) - sample name, used in log messages
        fastq_dict (dict) - fastq_dict[<read number>] = <fastq file dxid>
        mapper_app_dxid (dxid)
        ref_genome_index (dxid)
    '''

    ## Stock DNAnexus BWA-MEM app
    #mapper_app_name = 'bwa_mem_fastq_read_mapper'
    #mapper_app_version = '1.5.0'
    #mapper_app = MapperApp(name=mapper_app_name, version=mapper_app_version)   # DXApp object

    dxpy.set_workspace_id(project_id)
    # Create dict to store mapper app inputs
    mapper_app = dxpy.DXApp(mapper_app_dxid)
    mapper_input = {
        'genomeindex_targz': dxpy.dxlink(ref_genome_index)
    }  # hg19 : file-B6qq53v2J35Qyg04XxG0000V

    # Add fastq files to mapper app input dict
    if len(fastq_dict) == 0:
        print 'Error: No fastq files listed for sample %s' % sample
        sys.exit()
    elif len(fastq_dict) == 1:
        mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
    elif len(fastq_dict) == 2:
        mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
        mapper_input['reads2_fastqgz'] = dxpy.dxlink(fastq_dict['2'])
    else:
        print 'Error: More than 2 fastq files passed for mapping sample %s' % sample
        sys.exit()
    print mapper_input

    mapper_job = mapper_app.run(mapper_input)
    mapper_output = {
        "BAM": {
            "job": mapper_job.get_id(),
            "field": "sorted_bam"
        },
        "BAI": {
            "job": mapper_job.get_id(),
            "field": "sorted_bai"
        }
    }
    return mapper_output
Example #37
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.error("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        sys.exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name, str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se  ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance.  Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").

    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root+'_bismark.bam', bam_root+'.bam')
    return {
        "bam_file": dxpy.dxlink(dxpy.upload_local_file(bam_root+'.bam')),
        "report_file": dxpy.dxlink(dxpy.upload_local_file(bam_root+'_bismark_map_report.txt'))
    }
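# A sketch of the scatter/gather wiring the comment above describes, assuming a
# standard dxpy app with "process" and "postprocess" entry points; the "reads"
# input, "bam_files" field, and "merged_bam" output are hypothetical names, not
# the original applet's:
import dxpy

@dxpy.entry_point('main')
def main(reads, dme_ix, ncpus, reads_root):
    process_jobs = [
        dxpy.new_dxjob({'scattered_input': fq, 'dme_ix': dme_ix,
                        'ncpus': ncpus, 'reads_root': reads_root}, 'process')
        for fq in reads
    ]
    # "postprocess" will not start until every "process" subjob has finished
    postprocess_job = dxpy.new_dxjob(
        {'bam_files': [job.get_output_ref('bam_file') for job in process_jobs]},
        'postprocess',
        depends_on=process_jobs)
    return {'bam_file': postprocess_job.get_output_ref('merged_bam')}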
def sort_bam(job_inputs):
    input_bam = dxpy.DXFile(job_inputs['input_bam'])
    fn = input_bam.describe()['name']
    dxpy.download_dxfile(input_bam.get_id(), fn)

    # Sort and optionally remove unmapped and multimapped reads
    sorted_ofn = os.path.splitext(fn)[0] + '_sorted.bam'
    cmd = '/sambamba sort -t {0} -o /dev/stdout {1} '.format(
        multiprocessing.cpu_count() - 1, fn)
    if job_inputs['quality_filter']:
        cmd += '| /sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -o /dev/stdout /dev/stdin '
    cmd += '> ' + sorted_ofn
    print cmd
    subprocess.check_call(cmd, shell=True)

    # Count mapped, unique reads.
    cmd = '/sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -c ' + sorted_ofn
    print cmd
    num_uniquely_mapped_reads = int(
        subprocess.check_output(cmd, shell=True).strip())

    pcr_bottleneck_coefficient = calc_pcr_bottleneck_coefficient(sorted_ofn)

    final_ofn = sorted_ofn
    if job_inputs['remove_duplicates']:
        deduped_ofn = os.path.splitext(sorted_ofn)[0] + '_deduped.bam'
        md_metrics_ofn = os.path.splitext(
            sorted_ofn)[0] + '_deduped_metrics.txt'
        cmd = get_java_cmd()
        cmd += ' -jar /MarkDuplicates.jar I={0} O={1} METRICS_FILE={2} ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true '.format(
            sorted_ofn, deduped_ofn, md_metrics_ofn)
        print cmd
        subprocess.check_call(cmd, shell=True)
        bam_file = dxpy.dxlink(dxpy.upload_local_file(deduped_ofn).get_id())
        metrics_file = dxpy.dxlink(
            dxpy.upload_local_file(md_metrics_ofn).get_id())

        final_ofn = deduped_ofn
    else:
        bam_file = dxpy.dxlink(dxpy.upload_local_file(sorted_ofn).get_id())
        metrics_file = None

    return {
        'output_bam': bam_file,
        'dedup_metrics_file': metrics_file,
        'qc_uniquely_mapped_reads': num_uniquely_mapped_reads,
        'qc_pcr_bottleneck_coefficient': pcr_bottleneck_coefficient
    }
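# sort_bam() calls a get_java_cmd() helper that is not shown in this example.
# A minimal stand-in, assuming its only job is to size the JVM heap from the
# worker's memory (an assumption, not the original implementation):
def get_java_cmd():
    # Read total memory from /proc/meminfo and leave ~1 GiB headroom for the OS
    with open('/proc/meminfo') as meminfo:
        mem_kb = int(next(line for line in meminfo
                          if line.startswith('MemTotal')).split()[1])
    heap_gb = max(1, mem_kb // (1024 * 1024) - 1)
    return 'java -Xmx{0}g'.format(heap_gb)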
Example #39
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)
        # Get help info
        dx_utils.run_cmd(['minimap2', '-h'])
        # Call minimap2
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = [
            'sambamba', 'sort', '-m',
            '{0}G'.format(int(dx_utils.get_memory(suffix='G'))), '-o', ofn,
            '-t',
            str(multiprocessing.cpu_count()), '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(['sambamba', 'index', ofn])
        # append to outputs
        output_ofns.append(ofn)
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
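# dx_utils.run_pipe() is a project helper not shown here; a rough sketch of the
# behaviour it presumably has (chain the commands with pipes, fail if any stage
# fails) -- an assumption, not the actual dx_utils implementation:
import subprocess

def run_pipe(*cmds):
    procs = []
    prev_stdout = None
    for cmd in cmds:
        proc = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE propagate to upstream stages
        prev_stdout = proc.stdout
        procs.append(proc)
    out = procs[-1].communicate()[0]
    for proc in procs:
        if proc.wait() != 0:
            raise subprocess.CalledProcessError(proc.returncode, 'run_pipe stage')
    return out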
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):

    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    # rstrip(".bam") would strip any trailing '.', 'b', 'a', or 'm' characters,
    # so trim the extension explicitly instead
    name = dxpy.DXFile(BAMs[0]).describe()['name']
    if name.endswith(".bam"):
        name = name[:-4]

    # Fill in your application code here.

    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["BAM"] = dxpy.dxlink(BAM)

    return output
Example #41
def file_handler_from_fid(fid):
    '''Returns dx file handler from fid.'''
    try:
        dxlink = FILES[fid]
    except KeyError:
        dxlink = dxpy.dxlink(fid)
    return dxpy.get_handler(dxlink)
Example #42
def copy_files(fids, projectId, folder, overwrite=False):
    '''Copies array of dx file dicts to project:/folder, returning new array of dx file dicts.'''
    newFids = []
    for fid in fids:
        fileDict = dxpy.describe(FILES[fid])  # FILES contain dxLinks
        if fileDict['project'] == projectId:
            # cannot copy into the same project!!!
            # so just leave in place and pretend that we did!
            #proj = dxpy.DXProject(projectId)
            #proj.move(folder,[fid])
            newFids.append(fid)
            continue

        # Check to see if file already exists.
        alreadyThere = find_file(folder + '/' + fileDict['name'], projectId)
        if alreadyThere is None or overwrite:
            # remove what is alreadyThere?
            #if alreadyThere is not None:
            #    proj = dxpy.DXProject(projectId)
            #    proj.remove_objects([alreadyThere])
            dxFile = dxpy.get_handler(FILES[fid])
            newLink = dxpy.dxlink(dxFile.clone(projectId, folder))
        else:
            newLink = FILES[alreadyThere]
        if newLink is None:
            print "ERROR: Failed in copy of '" + fileDict['project'] + ":" + fileDict['name'] + \
                    "' to '" + projectId + ":" + folder + "'."
            sys.exit(1)
        newDict = dxpy.describe(newLink)
        FILES[newDict['id']] = newLink
        newFids.append(newDict['id'])

    return newFids
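# Hypothetical usage of copy_files(), assuming the FILES cache has already been
# populated (e.g. by find_file()); the ids below are placeholders:
fids = ['file-xxxx', 'file-yyyy']
new_fids = copy_files(fids, 'project-zzzz', '/aligned')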
Example #43
def copy_across_regions(local_path, record, dest_region, dest_proj,
                        dest_folder):
    print("copy_across_regions {} {} {} {}:{}".format(local_path,
                                                      record.get_id(),
                                                      dest_region,
                                                      dest_proj.get_id(),
                                                      dest_folder))
    # check if we haven't already created this record, and uploaded the file
    dest_asset = find_asset(dest_proj, dest_folder)
    if dest_asset is not None:
        print("Already copied to region {}".format(dest_region))
        return AssetDesc(dest_region, dest_asset.get_id(), dest_proj)

    # upload
    dest_proj.new_folder(dest_folder, parents=True)
    dxfile = upload_local_file(local_path, dest_proj, dest_folder, hidden=True)
    fid = dxfile.get_id()
    dest_asset = dxpy.new_dxrecord(name=record.name,
                                   types=['AssetBundle'],
                                   details={'archiveFileId': dxpy.dxlink(fid)},
                                   properties=record.get_properties(),
                                   project=dest_proj.get_id(),
                                   folder=dest_folder,
                                   close=True)
    return AssetDesc(dest_region, dest_asset.get_id(), dest_proj)
Example #44
def main(gvcf, N, sample_name_prefix, output_name):
    K = len(gvcf)

    # download all the source gVCFs
    sh("dx-download-all-inputs --parallel")

    # create output directory
    os.mkdir("gvcf")

    # parallel generate gVCF files
    pool = Pool(multiprocessing.cpu_count())
    inputs = [{
        "source_index": i % K,
        "sample_name_prefix": sample_name_prefix,
        "dest_index": i
    } for i in xrange(N)]
    pool.map(generate_gvcf_kwargs, inputs)

    # tar and upload
    dxid = subprocess.check_output([
        "/bin/bash", "-e", "-o", "pipefail", "-c",
        'tar cv gvcf | dx upload --brief --destination "{}.tar" -'.format(
            output_name)
    ]).strip()
    return {"tar": dxpy.dxlink(dxid)}
Example #45
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM, so chunks are capped at 10 million lines (matching the split -l value below)
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header 
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file":dxpy.dxlink(split_bam.get_id()), "BED_file":BED_file}, "run_gbc"))
            
    run_shell( "ls -l" )

    gbc_agg_input = {"sub_reports":[]}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job":j.get_id(), "field":"file"})

    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()
    
    return {"results":{"job":agg_job, "field":"cover"}}
Example #47
def test_mapping():
    dxpy.set_workspace_id('project-BpBjyqQ0Jk0Xv2B11Q8P6X59')
    applet = dxpy.find_one_data_object(
        name='bwa_mem_fastq_read_mapper',
        classname='applet',
        return_handler=True,
        zero_ok=False,
        project='project-B406G0x2fz2B3GVk65200003')
    applet.run({
        'genomeindex_targz':
        dxpy.dxlink('file-B6qq53v2J35Qyg04XxG0000V'),
        'reads_fastqgz':
        dxpy.dxlink('file-BpBjzFQ0Jk0Xk73YqQgJKg9Z'),
        'reads2_fastqgz':
        dxpy.dxlink('file-BpBk0400Jk0Xk73YqQgJKg9f')
    })
Example #48
    def upload_lane_html(self, raw_properties, tags):
        '''Upload lane.html file to DNAnexus project.
        
        Args:
            raw_properties (dict): Properties with values of different types.
            tags (list): Tags to attach to the uploaded file.

        Returns:
            dict: DXLink to lane.html file on DNAnexus object store.

        '''

        # Convert all property values to strings
        properties = {key: str(value) for key, value in raw_properties.items()}
        properties['file_type'] = 'lane_html'

        project_folder = '{}/miscellany'.format(self.project_path)

        local_file_path = (
            '{}/Reports/html/'.format(LOCAL_OUTPUT) +
            '{}/all/all/all/lane.html'.format(properties['flowcell_id']))
        remote_file_name = '{}_L{}.lane.html'.format(properties['run_name'],
                                                     properties['lane_index'])
        lane_html_dxid = dxpy.upload_local_file(filename=local_file_path,
                                                name=remote_file_name,
                                                properties=properties,
                                                tags=tags,
                                                project=self.project_dxid,
                                                folder=project_folder,
                                                parents=True)
        return dxpy.dxlink(lane_html_dxid)
Example #49
    def upload_tools_used(self, tools_used_dict, raw_properties):
        '''Write console commands to Tools Used file & upload.

        Args:
            tools_used_dict (dict): Description of executables & configurations.
            raw_properties (dict): Properties with values of different types.

        Returns:
            dict: DXLink to "tools used" file on DNAnexus object store.
        '''

        # Convert all property values to strings
        properties = {key: str(value) for key, value in raw_properties.items()}

        # Write file
        local_file_path = 'bcl2fastq_tools_used.json'
        with open(local_file_path, 'w') as TOOLS:
            TOOLS.write(json.dumps(tools_used_dict))

        # Upload file
        properties['file_type'] = 'tools_used'
        project_folder = '{}/miscellany'.format(self.project_path)
        tools_used_dxid = dxpy.upload_local_file(filename=local_file_path,
                                                 properties=properties,
                                                 project=self.project_dxid,
                                                 folder=project_folder,
                                                 parents=True)
        return dxpy.dxlink(tools_used_dxid)
Example #50
    def test_unpaired(self):
        bed_file = dxpy.find_one_data_object(name="hg19_GRCh37_Feb2009_RefSeq.bed", zero_ok=True)
        mappings = dxpy.find_one_data_object(name="unpaired_RNA-Seq_mappings", typename="LetterMappings", zero_ok=True)
        if bed_file is None:
            print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed.  Please upload it"
            return False
        if mappings is None:
            print "Cannot find unpaired_RNA-Seq_mappings.  Please upload it"
            return False

        input = { 'rna_seq_mappings': dxpy.dxlink(mappings['id']),
                  'bed_file': dxpy.dxlink(bed_file['id']) }

        print "Running program with", input
        job = self.program.run(input)
        print "launched test_unpaired ", job.get_id()
Example #51
def main(fastq, genomeindex_targz):

    print "something else"
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")

    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")
    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub(r"\.bwt$", "", genome_file)

    bwa_cmd = (
        "bwa mem -t {nproc} {genome} {fastq} | "
        "samtools view -u -S - | "
        "samtools sort -m 256M -@ {nproc} - output".format(
            nproc=multiprocessing.cpu_count(), genome=genome_file, fastq="input.fastq"
        )
    )
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["bam"] = dxpy.dxlink(bam)

    return output
Example #53
def produce_qc_report(individual_json_outputs,
                      sample_name,
                      output_project,
                      output_folder,
                      properties=None):
    """Combine the various statistics collected into a single dict for
    output."""

    # Avoid mutating a shared default dict across calls
    properties = dict(properties or {})
    output = {'Sample name': sample_name}
    misc_subfolder = output_folder + '/miscellany'

    for j in individual_json_outputs:
        for k in j:
            if k in output:
                output[k].update(j[k])
            else:
                output[k] = j[k]

    ofn = sample_name + '_stats.json'
    with open(ofn, 'w') as output_fh:
        output_fh.write(json.dumps(output))

    properties['file_type'] = 'qc_stats'
    output_json_file = dxpy.upload_local_file(filename=ofn,
                                              project=output_project,
                                              properties=properties,
                                              folder=misc_subfolder,
                                              parents=True)

    return {'combined_json_file': dxpy.dxlink(output_json_file)}
def main(input_file):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.

    subprocess.check_call(
        "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
        shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["output_file"] = dxpy.dxlink(output_file)

    return output
Example #55
def main(inputs):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # Use the inner extension of the last file - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    pooled_filename = '-'.join(
        [splitext(splitext(fn)[0])[0]
         for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -c'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["pooled"] = dxpy.dxlink(pooled)

    return output
def main(repo_url, ref='master', credentials=None, build_options=None):

    clone_dir = app_builder.clone_repository(repo_url, ref=ref, credentials=credentials)

    applet_id = app_builder.create_applet(clone_dir, build_options=build_options)

    return { 'output_applet': dxpy.dxlink(applet_id) }
Example #57
    def compile_output_generic(oname, ovalue):
        if isinstance(ovalue, list):
            return [ compile_output_generic(oname, x) for x in ovalue ]
        elif isinstance(ovalue, dict):
            if is_output_file(ovalue):
                def remove_prefix(text, prefix):
                    if text.startswith(prefix):
                        return text[len(prefix):]
                    return text
                def upload_file(ovalue):
                    sh("unset DX_WORKSPACE_ID && dx cd $DX_PROJECT_CONTEXT_ID: && dx mkdir -p {}".format(folder))
                    return dxpy.dxlink(dxpy.upload_local_file(remove_prefix(ovalue['location'], "file://"), wait_on_close=True, project=dxpy.PROJECT_CONTEXT_ID, folder=folder))

                if skip_downloads:
                    files = dxpy.dxlink(open(remove_prefix(ovalue['location'], "file://")).read().rstrip())
                else:
                    files = upload_file(ovalue)
                if 'secondaryFiles' in ovalue:
                    files = {'primaryFile': files, 'secondaryFiles': compile_output_generic(oname, ovalue['secondaryFiles'])}
                return files

            # TODO: This feature needs to be completed to reset env here, smartly check whether files exist already, and work for inputs
            elif is_output_directory(ovalue):
                sh("unset DX_WORKSPACE_ID && dx cd $DX_PROJECT_CONTEXT_ID: && dx upload -r {}".format(ovalue['path']))
                return ovalue
            else:
                return { k : compile_output_generic(k,v) for k,v in ovalue.items() }
        else:
            return ovalue
Example #58
def postprocess(**inputs):
    kwargs = inputs["kwargs"]
    subjob_outputs = inputs["subjob_outputs"] 
    print "\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs))

    output_prefix = kwargs["output_prefix"]
    variant_suffixes = kwargs["variant_suffixes"]
    
    app_output_fn = {}
    for subjob_output in subjob_outputs:
        for type, id in subjob_output.iteritems():
            file_id = id["$dnanexus_link"]
            filename = output_prefix + "_" + variant_suffixes[type]
            
            print "Downloading " + str(file_id) + " into " + filename
            dxpy.download_dxfile(dxid=file_id, filename=filename, append=True)
            app_output_fn[type] = filename

    postprocess_outputs = {}
    need_to_renumber = ["deletions", "short_inserts", "tandem_duplications", "inversions", "large_inserts"]
    for type, fn in app_output_fn.iteritems():
        out_fn = fn
        if type in need_to_renumber:
            out_fn = RenumberMergedOutput(fn, fn+"_renumbered") 
        print "\nUploading {file} as {fn}".format(file=out_fn, fn=fn)
        postprocess_outputs[type] = dxpy.dxlink(dxpy.upload_local_file(out_fn, name=fn))

    if kwargs["export_vcf"]:
        DownloadRefFasta(kwargs["reference_fasta"])
        postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_prefix, ref_fn="reference_fasta") 
        
    return postprocess_outputs
Example #59
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): list of DXlinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters.
    Subjobs work on a fresh instance, so files must be downloaded to the machine.
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())

    return {"countDXLink": countDXlink}