Example #1
def main(**job_inputs):

    output = {}
    os.mkdir('/home/dnanexus/split')

    # make k-mer histogram with meryl
    _write_generator_file(job_inputs['sequences_fastx'], GENERATOR_FILENAME)
    _run_meryl(output, job_inputs["sequences_fastx"], job_inputs["k_mer_size"], job_inputs["is10x"])

    # Run Genomescope
    mem_in_b = dx_utils.run_cmd("head -n1 /proc/meminfo | awk '{print int($2*0.6*1024)}'", returnOutput=True)
    read_length = _get_read_length()
    cmd = ['Rscript', './genomescope.R', "mer_counts.tsv", str(job_inputs['k_mer_size']),
           str(read_length), './', str(MAX_KMER_COVERAGE)]
    _run_cmd(cmd)
    genomescope_summary = _get_genomescope_summary('summary.txt', os.path.exists('model.txt'))

    # Upload the output files.
    output['genomescope_figures'] = [
        dxpy.upload_local_file('plot.png', name='{0}.gs.png'.format(job_inputs['output_prefix'])),
        dxpy.upload_local_file('plot.log.png', name='{0}.gs.log.png'.format(job_inputs['output_prefix']))
    ]
    output['genomescope_files'] = [
        dxpy.upload_local_file('summary.txt', name='{0}.summary.txt'.format(job_inputs['output_prefix'])),
        dxpy.upload_local_file('progress.txt', name='{0}.progress.txt'.format(job_inputs['output_prefix']))
    ]
    # If GenomeScope failed to converge for some reason, there will be no model.txt file.
    if os.path.exists('model.txt'):
        output['genomescope_files'].append(dxpy.upload_local_file('model.txt', name='{0}.model.txt'.format(job_inputs['output_prefix'])))
    output.update(genomescope_summary)

    return output
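
Example #1 leans on private helpers (_write_generator_file, _run_meryl, _run_cmd, _get_genomescope_summary) that the listing omits. A minimal _run_cmd wrapper consistent with how it is called above might look like this (an assumption, not the applet's actual code):

import subprocess

def _run_cmd(cmd):
    # cmd is an argv list, e.g. ['Rscript', './genomescope.R', ...].
    # Echo the command into the job log, then fail loudly on a non-zero exit.
    print(' '.join(cmd))
    subprocess.check_call(cmd)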
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink, dme_ix_dxlink, target_root):
    '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8'''

    print "* coverage(): Retrieve context files and index..."
    CpG_context = 'output/CpG_context_%s.txt' % target_root
    CHG_context = 'output/CHG_context_%s.txt' % target_root
    CHH_context = 'output/CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* coverage(): Uncompress index..."
    run_cmd('tar -zxf ' + dme_ix)

    (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context, CHG_context, CHH_context)
    
    print "* coverage(): Storing coverage results..."
    cx_report_dxfile = dxpy.upload_local_file(cx_report)
    bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz)

    print "* coverage(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "cx_report_dxlink":     dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink":   dxpy.dxlink(bedgraph_gz_dxfile)
    }
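
coverage() assumes a run_cmd helper that takes a shell command string. A plausible minimal version, inferred from the call sites above (hypothetical; the variant used elsewhere in this listing also accepts output-capture keywords, omitted here):

import subprocess

def run_cmd(command):
    # Run a shell command string such as 'tar -zxf dme_index.tar.gz' and
    # raise CalledProcessError on failure so the job fails visibly.
    print('* running: ' + command)
    subprocess.check_call(command, shell=True)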
Example #3
def main(HiC_norm_binning_hdf5, HiC_data_object_hdf5, fend_object_hdf5, chromosome, contact_matrix_binsize, chrlen_file):
 
    dxpy.download_dxfile(HiC_norm_binning_hdf5, "HiC_norm_binning.hdf5")
    dxpy.download_dxfile(HiC_data_object_hdf5, "HiC_data_object.hdf5")
    dxpy.download_dxfile(fend_object_hdf5, "fend_object.hdf5")
    dxpy.download_dxfile(chrlen_file, "chrlen_file")
    
    
    command = "cp -r /miniconda ~; cp -r /.conda ~; bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; source activate gitar; which python; python /usr/bin/HiCtool_norm_contact_matrix.arg.py HiC_norm_binning.hdf5 {chromosome} {contact_matrix_binsize} chrlen_file .'".format(chromosome=chromosome, contact_matrix_binsize=contact_matrix_binsize)
    print(command)
    subprocess.call(command, shell=True)
    
    observed_contact_matrix_filename = subprocess.check_output("ls -1 HiCtool_observed_contact_matrix*.txt", shell=True).strip()
    normalized_fend_contact_matrix_filename = subprocess.check_output("ls -1 HiCtool_normalized_fend_contact_matrix*.txt", shell=True).strip()
    normalized_enrich_contact_matrix_filename = subprocess.check_output("ls -1 HiCtool_normalized_enrich_contact_matrix*.txt", shell=True).strip()
    expected_fend_contact_matrix_filename = subprocess.check_output("ls -1 HiCtool_expected_fend_contact_matrix*.txt", shell=True).strip()
    expected_enrich_contact_matrix_filename = subprocess.check_output("ls -1 HiCtool_expected_enrich_contact_matrix*.txt", shell=True).strip()

    observed_contact_matrix_file = dxpy.upload_local_file(observed_contact_matrix_filename)
    normalized_fend_contact_matrix_file = dxpy.upload_local_file(normalized_fend_contact_matrix_filename)
    normalized_enrich_contact_matrix_file = dxpy.upload_local_file(normalized_enrich_contact_matrix_filename)
    expected_fend_contact_matrix_file = dxpy.upload_local_file(expected_fend_contact_matrix_filename)
    expected_enrich_contact_matrix_file = dxpy.upload_local_file(expected_enrich_contact_matrix_filename)

    return {
        "observed_contact_matrix": observed_contact_matrix_file,
        "normalized_fend_contact_matrix": normalized_fend_contact_matrix_file,
        "normalized_enrich_contact_matrix": normalized_enrich_contact_matrix_file,
        "expected_fend_contact_matrix": expected_fend_contact_matrix_file,
        "expected_enrich_contact_matrix": expected_enrich_contact_matrix_file
    }
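
The five "ls -1 ..." check_output calls above shell out just to locate one file each. An in-process alternative using glob would avoid the subshell; this is a sketch of the idea, not code from the original applet:

import glob

def find_single_output(pattern):
    # e.g. find_single_output('HiCtool_observed_contact_matrix*.txt')
    matches = glob.glob(pattern)
    if len(matches) != 1:
        raise RuntimeError('expected exactly one match for %s, got %d' % (pattern, len(matches)))
    return matches[0]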
Example #4
def create_index_file(bam_filename, bam_dxlink):
    """Create Index file.
    Sorts BAM if needed
    """
    print("Creating Index file.")
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print("Sorting BAM")
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = [
            'samtools',
            'sort',
            bam_filename,
            bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print("Indexing BAM")
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    finally:
        index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
        aligned_sorted_bam = dxpy.dxlink(dxpy.upload_local_file(sorted_filename))
        return aligned_sorted_bam, index_file_link
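
create_index_file() relies on run_cmd raising NotIndexedException when 'samtools index' rejects an unsorted BAM. The exception class and wrapper are not shown; one way they could be wired up (the names come from the example, the bodies are assumptions):

import subprocess

class NotIndexedException(Exception):
    # Raised when 'samtools index' exits non-zero, typically on unsorted input.
    pass

def run_cmd(cmd):
    # cmd is an argv list, e.g. ['samtools', 'index', 'sample.bam'].
    proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
    _, err = proc.communicate()
    if proc.returncode != 0:
        raise NotIndexedException(err)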
Example #5
    def get_use_bases_mask(self, output_folder):
        '''
        command = "python calculate_use_bases_mask.py {runinfoFile} {sampleSheet} {lane}"
        gbsc_utils.createSubprocess(cmd=command)
        '''

        misc_subfolder = output_folder + '/miscellany'
        run_info_file = 'RunInfo.xml'

        command = 'python calculate_use_bases_mask.py %s ' % run_info_file
        command += '%s ' % self.sample_sheet
        command += '%d ' % int(self.lane_index)
        command += '%d' % int(self.bcl2fastq_version)

        stdout, stderr = self.createSubprocess(cmd=command, pipeStdout=True)
        self.use_bases_mask = stdout
        print 'This is use_bases_mask value: %s' % self.use_bases_mask

        use_bases_mask_file = 'use_bases_mask.txt'
        with open(use_bases_mask_file, 'w') as OUT:
            OUT.write(self.use_bases_mask)
        dxpy.upload_local_file(filename=use_bases_mask_file,
                               properties=None,
                               project=self.lane_project_id,
                               folder=misc_subfolder,
                               parents=True)
        return self.use_bases_mask
def create_index_file(bam_filename, bam_dxlink):
    """Create Index file.
    Sorts BAM if needed
    """
    print "Creating Index file."
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print "Sorting BAM"
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = [
            'samtools',
            'sort',
            bam_filename,
            bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print "Indexing BAM"
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    finally:
        index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
        aligned_sorted_bam = dxpy.dxlink(dxpy.upload_local_file(sorted_filename))
        return aligned_sorted_bam, index_file_link
Example #7
def main(bam1, bam2, RE_site_bed):

    dxpy.download_dxfile(bam1, "input1.bam")
    dxpy.download_dxfile(bam2, "input2.bam")
    dxpy.download_dxfile(RE_site_bed, "RE.bed")
    
    command = "cp -r /miniconda ~; cp -r /.conda ~; bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; source activate gitar; which python; python /usr/bin/HiCtool_hifive.arg.py input1.bam input2.bam RE.bed .'"
    print(command)
    subprocess.call(command, shell=True)
    

    fend_object_hdf5_filename = "./fend_object.hdf5"
    HiC_data_object_hdf5_filename = "./HiC_data_object.hdf5"
    HiC_distance_function_hdf5_filename = "./HiC_distance_function.hdf5"
    HiC_norm_binning_hdf5_filename = "./HiC_norm_binning.hdf5"
    HiC_project_object_hdf5_filename = "./HiC_project_object.hdf5"
    
    #fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename, folder=outdir)
    #HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename, folder=outdir)
    #HiC_distance_function_hdf5_file= dxpy.upload_local_file(HiC_distance_function_hdf5_filename, folder=outdir)
    #HiC_norm_binning_hdf5_file= dxpy.upload_local_file(HiC_norm_binning_hdf5_filename, folder=outdir)
    #HiC_project_object_hdf5_file= dxpy.upload_local_file(HiC_project_object_hdf5_filename, folder=outdir)

    fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename)
    HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename)
    HiC_distance_function_hdf5_file = dxpy.upload_local_file(HiC_distance_function_hdf5_filename)
    HiC_norm_binning_hdf5_file = dxpy.upload_local_file(HiC_norm_binning_hdf5_filename)
    HiC_project_object_hdf5_file = dxpy.upload_local_file(HiC_project_object_hdf5_filename)

    return {
        "fend_object_hdf5": fend_object_hdf5_file,
        "HiC_data_object_hdf5": HiC_data_object_hdf5_file,
        "HiC_distance_function_hdf5": HiC_distance_function_hdf5_file,
        "HiC_norm_binning_hdf5": HiC_norm_binning_hdf5_file,
        "HiC_project_object_hdf5": HiC_project_object_hdf5_file
    }
Example #8
def main(HiC_norm_binning_hdf5, HiC_data_object_hdf5, fend_object_hdf5, contact_matrix_binsize, chrlen_file):
 
    dxpy.download_dxfile(HiC_norm_binning_hdf5, "HiC_norm_binning.hdf5")
    dxpy.download_dxfile(HiC_data_object_hdf5, "HiC_data_object.hdf5")
    dxpy.download_dxfile(fend_object_hdf5, "fend_object.hdf5")
    dxpy.download_dxfile(chrlen_file, "chrlen_file")
    
    FH_chrlen = open("chrlen_file", "r")
    chrs = [line.strip().split('\t')[0] for line in FH_chrlen]
    matrix_list_filename = "HiCtool_matrix_list.txt"
    fout = open(matrix_list_filename, "w")

    for chrom in chrs:
        command = "cp -r /miniconda ~; cp -r /.conda ~; bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; source activate gitar; which python; python /usr/bin/HiCtool_norm_contact_matrix.arg.py HiC_norm_binning.hdf5 {chromosome} {contact_matrix_binsize} chrlen_file .'".format(chromosome=chrom, contact_matrix_binsize=contact_matrix_binsize)
        print(command)
        subprocess.call(command, shell=True)
        observed_matrix_filename = subprocess.check_output("ls -1 HiCtool_observed_contact_matrix_chr{chr}_*txt".format(chr=chrom), shell=True).strip()
        normalized_fend_matrix_filename = subprocess.check_output("ls -1 HiCtool_normalized_fend_contact_matrix_chr{chr}_*txt".format(chr=chrom), shell=True).strip()
        normalized_enrich_matrix_filename = subprocess.check_output("ls -1 HiCtool_normalized_enrich_contact_matrix_chr{chr}_*txt".format(chr=chrom), shell=True).strip()
        fout.write("chr" + chrom + "\t" + observed_matrix_filename + "\t" + normalized_fend_matrix_filename + "\t" + normalized_enrich_matrix_filename + '\n')

    fout.close()
    FH_chrlen.close()    

    all_contact_matrices_filename = "HiCtool_contact_matrices.tar.gz"
    subprocess.call("tar -czf {gzfile} HiCtool_*contact_matrix*txt".format(gzfile=all_contact_matrices_filename), shell=True)
    all_contact_matrices_file = dxpy.upload_local_file(all_contact_matrices_filename)
    matrix_list_file = dxpy.upload_local_file(matrix_list_filename)

    return { "all_contact_matrices": all_contact_matrices_file, "matrix_list": matrix_list_file } 
def main(input_bam, paired=True, params=''):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'], [".bam", ".BAM", ".sam", ".SAM"])


    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.

    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name

    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
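
remove_extensions() is assumed by this example but not defined in the listing. A minimal version consistent with the call site (an assumption):

def remove_extensions(filename, extensions):
    # Strip the first suffix in `extensions` that `filename` ends with, e.g.
    # remove_extensions('reads.bam', ['.bam', '.BAM']) -> 'reads'.
    for ext in extensions:
        if filename.endswith(ext):
            return filename[:-len(ext)]
    return filename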
def process(fastq):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    print fastq
    reads_filename = dxpy.describe(fastq)["name"]
    # str.rstrip strips a character set, not a suffix, so peel extensions explicitly.
    reads_basename = reads_filename
    for ext in ('.gz', '.fq', '.fastq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]
    reads_file = dxpy.download_dxfile(fastq, "fastq.gz")

    subprocess.check_call(["mkdir", "output"])
    print "Run QC"
    fqc_command = "/usr/bin/FastQC/fastqc fastq.gz -o output"
    print fqc_command
    stdio = subprocess.check_output(shlex.split(fqc_command))
    print stdio
    print subprocess.check_output(["ls", "-l", "output"])
    subprocess.check_call(["unzip", "output/fastq_fastqc.zip"])
    print "Upload results"
    subprocess.check_call(["mv", "fastq_fastqc/fastqc_data.txt", "%s_data.txt" % reads_basename])
    subprocess.check_call(["mv", "fastq_fastqc/summary.txt", "%s_summary.txt" % reads_basename])
    subprocess.check_call(["mv", "output/fastq_fastqc.zip", "%s_fastqc.zip" % reads_basename])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename)
    zip_dxfile = dxpy.upload_local_file("%s_fastqc.zip" % reads_basename)
    print report_dxfile
    return {"report": report_dxfile, "summary": summary_dxfile, "zip": zip_dxfile}
def main(input_bam, paired=True, params=''):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.

    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name

    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [r.replace('.tar.gz', '') for r in job_inputs['replicate_idr_files']]
    pseudo_replicate_idr_prefixes = [r.replace('.tar.gz', '') for r in job_inputs['pseudo_replicate_idr_files']]
    pooled_pseudo_replicate_idr_prefix = job_inputs['pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep, num_peaks_each_pseudo_rep, numPeaks_Rep0) = get_thresholds(replicate_idr_prefixes,
                                                                                    pseudo_replicate_idr_prefixes,
                                                                                    pooled_pseudo_replicate_idr_prefix,
                                                                                    job_inputs['replicate_peaks_threshold'],
                                                                                    job_inputs['pseudo_replicate_peaks_threshold'],
                                                                                    job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(job_inputs['pooled_replicate_peaks_file'])
    coi = {'signal.value': 7, 'p.value': 8, 'q.value': 9}[job_inputs['ranking_measure']]
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(coi, pooled_replicates_peaks_fn, max_numPeaks_Rep, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(coi, pooled_replicates_peaks_fn, opt_thresh, job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file('{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file('{0}_optimal.regionPeak.gz'.format(job_inputs['output_prefix']))

    return {'conservative_peak_calls': dxpy.dxlink(conservative_result),
            'optimal_peak_calls': dxpy.dxlink(optimal_result),
            'num_peaks_each_rep': num_peaks_each_rep,
            'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
            'num_peaks_pooled_pseudo_rep': numPeaks_Rep0}
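
download_and_gunzip_file() is another helper the listing takes for granted (a dx_utils variant with skip_decompress also appears in a later example). A minimal sketch of the plain form used here, assuming it returns the local filename:

import subprocess
import dxpy

def download_and_gunzip_file(file_dxlink):
    # Download a platform file under its own name; gunzip it if compressed.
    dxfile = dxpy.DXFile(file_dxlink)
    fn = dxfile.describe()['name']
    dxpy.download_dxfile(dxfile.get_id(), fn)
    if fn.endswith('.gz'):
        subprocess.check_call(['gunzip', '-f', fn])
        fn = fn[:-len('.gz')]
    return fn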
Example #13
def makeInputsBwa():
    try:
        contigset_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer not found, please upload them")

    genome_archive = dxpy.upload_local_file(os.path.join(test_resources_dir, "hg19_chr22.fa.xz"), wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22", "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]

    left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    #left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    #right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)

    reads_importer_input = {"left_file": dxpy.dxlink(left_reads), "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]

    return {"reads": [reads] * 3, "reference": contig_set}
    def test_get_applet_with_asset(self):
        bundle_name = "test-bundle-depends.tar.gz"
        bundle_tmp_dir = tempfile.mkdtemp()
        os.mkdir(os.path.join(bundle_tmp_dir, "a"))
        with open(os.path.join(bundle_tmp_dir, 'a', 'foo.txt'), 'w') as file_in_bundle:
            file_in_bundle.write('foo\n')
        subprocess.check_call(['tar', '-czf', os.path.join(bundle_tmp_dir, bundle_name),
                               '-C', os.path.join(bundle_tmp_dir, 'a'), '.'])
        bundle_file = dxpy.upload_local_file(filename=os.path.join(bundle_tmp_dir, bundle_name),
                                             project=self.project,
                                             wait_on_close=True)

        asset_file = dxpy.upload_local_file(filename=os.path.join(bundle_tmp_dir, bundle_name),
                                            project=self.project,
                                            wait_on_close=True)

        dxrecord_details = {"archiveFileId": {"$dnanexus_link": asset_file.get_id()}}
        dxrecord = dxpy.new_dxrecord(project=self.project, types=["AssetBundle"], details=dxrecord_details,
                                     name='asset-lib-test', properties={"version": "0.0.1"})
        dxrecord.close()
        asset_bundle_id = dxrecord.get_id()

        asset_file.set_properties({"AssetBundle": asset_bundle_id})

        code_str = """#!/bin/bash
                    main(){
                        echo 'Hello World'
                    }
                    """
        app_spec = {
            "name": "asset_depends",
            "dxapi": "1.0.0",
            "runSpec": {
                "code": code_str,
                "interpreter": "bash",
                "assetDepends":  [{"id": asset_bundle_id}],
                "bundledDepends": [{"name": bundle_name, "id": {"$dnanexus_link": bundle_file.get_id()}}]
            },
            "inputSpec": [],
            "outputSpec": [],
            "version": "1.0.0"
        }
        app_dir = self.write_app_directory("asset_depends", json.dumps(app_spec))
        asset_applet_id = json.loads(run("dx build --json {app_dir}".format(app_dir=app_dir)))["id"]
        with chdir(tempfile.mkdtemp()):
            run("dx get --omit-resources " + asset_applet_id)
            self.assertTrue(os.path.exists("asset_depends"))
            self.assertFalse(os.path.exists(os.path.join("asset_depends", "resources")))
            self.assertTrue(os.path.exists(os.path.join("asset_depends", "dxapp.json")))

            applet_spec = json.load(open(os.path.join("asset_depends", "dxapp.json")))
            self.assertEqual([{"name": "asset-lib-test",
                               "project": self.project,
                               "folder": "/",
                               "version": "0.0.1"}
                              ],
                             applet_spec["runSpec"]["assetDepends"])
            self.assertEqual([{"name": bundle_name, "id": {"$dnanexus_link": bundle_file.get_id()}}],
                             applet_spec["runSpec"]["bundledDepends"])
Example #15
def main(input_bam, paired_end):

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # str.rstrip strips a character set, not a suffix, so trim '.bam' explicitly.
    input_bam_basename = input_bam_filename
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-len('.bam')]
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
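
common.run_pipe chains shell stages with OS pipes and can redirect the last stage into outfile. The ENCODE common module is not included here; a simplified stand-in consistent with how it is called above (stderr is not captured in this sketch, and stdout is None when redirected to outfile):

import shlex
import subprocess

def run_pipe(steps, outfile=None):
    # e.g. run_pipe(['bamToBed -i in.bam', 'gzip -cn'], outfile='out.gz')
    prev = None
    out_fh = open(outfile, 'wb') if outfile else None
    for i, step in enumerate(steps):
        is_last = (i == len(steps) - 1)
        stdout = out_fh if (is_last and out_fh) else subprocess.PIPE
        proc = subprocess.Popen(shlex.split(step),
                                stdin=prev.stdout if prev else None,
                                stdout=stdout)
        if prev:
            prev.stdout.close()  # let upstream stages receive SIGPIPE
        prev = proc
    out, err = prev.communicate()
    if out_fh:
        out_fh.close()
    return out, err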
def postprocess(bam_files,
                report_files,
                bam_root,
                nthreads=8,
                use_cat=False,
                use_sort=False):
    # This is the "gather" phase which aggregates and performs any
    # additional computation after the "map" (and therefore after all
    # the "process") jobs are done.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logger.debug("** In Postprocess - refactored dme-merge-bams - *")

    versions = "Unknown"
    if os.path.isfile(VERSION_SCRIPT):
        try:
            versions = subprocess.check_output(
                shlex.split(
                    'tool_versions.py --dxjson dnanexus-executable.json'))
        except:
            pass

    merged_bam = merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads)

    (merged_report, report_file_names) = merge_reports(bam_root, report_files,
                                                       bam_root)

    (merged_qc, nreads, metadata) = merge_qc(bam_root, report_file_names)

    props = {
        'SW': versions,
        'reads': nreads,
    }
    output = {
        "bam_techrep":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_bam,
                                   details=metadata,
                                   properties=props)),
        "bam_techrep_qc":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_qc,
                                   details=metadata,
                                   properties={'SW': versions})),
        "map_techrep":
        dxpy.dxlink(
            dxpy.upload_local_file(merged_report,
                                   details=metadata,
                                   properties={'SW': versions})),
        "reads":
        nreads,
        "metadata":
        json.dumps(metadata)
    }
    return output
def main(input_SAM, deviations=None, histogram_width=None, min_percent=None, metric_acc_level=None, ref=None, is_sorted=None, stop_after=None):

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_SAM, "input")
    if ref is not None:
        dxpy.download_dxfile(ref, "ref.fa")


    command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar"
    command += " INPUT=input"
    command += " OUTPUT=insert_distribution.txt"
    command += " HISTOGRAM_FILE=histogram.pdf"
    if deviations is not None:
        command += " DEVIATIONS=" + str(deviations)
    if histogram_width is not None:
        command += " HISTOGRAM_WIDTH=" + str(histogram_width)
    if min_percent is not None:
        command += " MINIMUM_PCT=" + str(min_percent)
    if metric_acc_level is not None:
        for level in metric_acc_level:
            command += " METRIC_ACCUMULATION_LEVEL=" + str(level)
    if ref is not None:
        command += " REFERENCE_SEQUENCE=ref.fa"
    if is_sorted is not None:
        command += " ASSUME_SORTED=" + ("true" if is_sorted else "false")
    if stop_after is not None:
        command += " STOP_AFTER=" + str(stop_after)

    print "Executing:"
    print command

    # CALL the command here:
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    histogram = dxpy.upload_local_file("histogram.pdf")
    histogram.rename(dxpy.DXFile(input_SAM).describe()['name']+"_histogram.pdf")
    output_dist = dxpy.upload_local_file("insert_distribution.txt")
    output_dist.rename(dxpy.DXFile(input_SAM).describe()['name']+"_insert_dist.txt")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["histogram"] = dxpy.dxlink(histogram)
    output["output"] = dxpy.dxlink(output_dist)

    return output
Example #18
    def upload(self, to_upload, **kwargs):
        """Upload a list of files and directories to a directory.

        This is not a batch level operation.
        If some file errors, the files uploaded before will remain present.

        Args:
            to_upload (List[Union[str, OBSUploadObject]]): A list of posix file names,
                directory names, or OBSUploadObject objects to upload.

        Raises:
            ValueError: When source path is not a directory
            TargetExistsError: When destination directory already exists
        """
        dx_upload_objects = [
            name for name in to_upload if isinstance(name, OBSUploadObject)
        ]
        all_files_to_upload = utils.walk_files_and_dirs([
            name for name in to_upload
            if not isinstance(name, OBSUploadObject)
        ])
        dx_upload_objects.extend([
            OBSUploadObject(
                f,
                object_name=('/' +
                             self.resource if self.resource else Path('')) /
                utils.file_name_to_object_name(f)) for f in all_files_to_upload
        ])

        for upload_obj in dx_upload_objects:
            upload_obj.object_name = Path(upload_obj.object_name)
            upload_obj.source = Path(upload_obj.source)
            dest_file = Path('{drive}{project}:{path}'.format(
                drive=self.drive,
                project=self.canonical_project,
                path=upload_obj.object_name))

            if upload_obj.source.isfile():
                dest_is_file = dest_file.isfile()
                if dest_is_file:  # only occurs if upload is called directly with existing objects
                    logger.warning(
                        'Destination path ({}) already exists, will not cause '
                        'duplicate file objects on the platform. Skipping...'.
                        format(dest_file))
                else:
                    with _wrap_dx_calls():
                        dxpy.upload_local_file(
                            filename=upload_obj.source,
                            project=self.canonical_project,
                            folder='/' + (dest_file.parent.resource or ''),
                            parents=True,
                            name=dest_file.name)
            elif upload_obj.source.isdir():
                dest_file.makedirs_p()
            else:
                raise stor_exceptions.NotFoundError(
                    'Source path ({}) does not exist. Please provide a valid source'
                    .format(upload_obj.source))
Example #19
def once():
    try:
        dxpy.upload_local_file(filename=local_path,
                               project=project.get_id(),
                               folder=destFolder,
                               wait_on_close=True)
        return True
    except Exception:
        return False
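
The once() closure is clearly meant to be retried: it swallows the upload error and reports success as a boolean. The surrounding retry loop is not shown; it presumably looks something like this (hypothetical; local_path, project, and destFolder come from the enclosing scope):

import time

def upload_with_retries(attempts=3, delay=5):
    for i in range(attempts):
        if once():
            return
        time.sleep(delay * (i + 1))  # simple linear backoff between tries
    raise RuntimeError('upload failed after %d attempts' % attempts)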
Example #20
def main(**kwargs):

    dxpy.download_folder(DCC_CREDENTIALS_PROJECT,
                         '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" %
                       ("." * len(authid), "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})

    return output
Example #21
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''

    (target_root, biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map, all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports)
    
    print "* merge_extract(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_simple_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* Retrieve split report..."
    append_line("\n===== bismark_methylation_extractor: splitting_report =====", biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root, out=biorep_bam_qc, append=True, silent=True)

    # TODO: Is this even needed?  Currently we do to get the size!
    #if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #    print "* merge_extract(): Storing biorep bam..."
    #    props_ex = props.copy()
    #    props_ex.update({ 'reads': str(reads) })
    #    biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True))
    #else:
    #    biorep_bam_dxlink = bam_set[0]

    print "* merge_extract(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc, properties=props, details=qc_metrics)
    biorep_map_dxfile    = dxpy.upload_local_file(biorep_map, properties=props, details=qc_metrics)
    split_report_dxfile  = dxpy.upload_local_file(target_root + '_splitting_report.txt')
    chrom_sizes_dxfile   = dxpy.upload_local_file('input/chrom.sizes')
    mbias_report_dxfile  = dxpy.upload_local_file(target_root + '_mbias_report.txt', properties=props, details=qc_metrics)
    CpG_context_dxfile   = dxpy.upload_local_file('output/CpG_context_%s.txt' % (target_root))
    CHG_context_dxfile   = dxpy.upload_local_file('output/CHG_context_%s.txt' % (target_root))
    CHH_context_dxfile   = dxpy.upload_local_file('output/CHH_context_%s.txt' % (target_root))

    print "* merge_extract(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        #"biorep_bam_dxlink":    biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink":    dxpy.dxlink(biorep_map_dxfile),
        "CpG_context_dxlink":   dxpy.dxlink(CpG_context_dxfile),
        "CHG_context_dxlink":   dxpy.dxlink(CHG_context_dxfile),
        "CHH_context_dxlink":   dxpy.dxlink(CHH_context_dxfile),
        "split_report_dxlink":  dxpy.dxlink(split_report_dxfile),
        "chrom_sizes_dxlink":   dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink":  dxpy.dxlink(mbias_report_dxfile),
        "target_root":          target_root,
        "qc_metrics":           qc_metrics
    }
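
append_line(), used above to stitch the splitting report onto the QC file, is not defined in the listing. Given the call site, something like this would do (an assumption):

def append_line(line, filename):
    # Append one line of text to `filename`, adding a trailing newline.
    with open(filename, 'a') as fh:
        fh.write(line + '\n')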
def process(filename, bucket_url, project, folder, skipvalidate=False):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    logger.debug(filename)

    test = list(dxpy.find_data_objects(classname='file',
                                       folder=folder, project=project,
                                       name_mode='exact', name=filename,
                                       return_handler=False))

    if not test:
        # cp the file from the bucket
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' %(bucket_url)), stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' %(filename)))
        dx_file = dxpy.upload_local_file(filename, project=project, folder=folder)

    else:
        dxpy.download_dxfile(test[0]['id'], filename)
        dx_file = dxpy.dxfile.DXFile(test[0]['id'])
    # str.rstrip strips a character set, not a suffix, so peel extensions explicitly.
    reads_basename = filename
    for ext in ('.gz', '.fq', '.fastq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]

    if skipvalidate:
        return {
            "file": dx_file,
            "report": None,
            "summary": None,
            "zip": None
        }

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls','-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])
    logger.info("Upload results")

    subprocess.check_call(['mv', "%s_fastqc/fastqc_data.txt" % reads_basename, "%s_data.txt" % reads_basename ])
    subprocess.check_call(['mv', "%s_fastqc/summary.txt" % reads_basename, "%s_summary.txt" % reads_basename ])

    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename, folder=folder, project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename, folder=folder, project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename, folder=folder, project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
Example #23
def main(cons1, cons2, outroot, xchr=True, recalnums=1, skip=20, timemax=7500000.0):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    cons1 = dxpy.DXFile(cons1)
    cons2 = dxpy.DXFile(cons2)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(cons1.get_id(), "cons1")
    dxpy.download_dxfile(cons2.get_id(), "cons2")
    outname1 = outroot + '.psmcfa'
    outname2 = outroot + '.psmc'

    # Fill in your application code here.
    #create the psmcfa file
    createPSMCfa('cons1', 'cons2', outname1, skip)
    print 'Generated the PSMC fasta file.'
    sys.stdout.flush()
    #run psmc the first time
    subprocess.check_call(['psmc', '-t', '15', '-r', '5', '-p', "4+25*2+4+6", '-o', 'test.psmc', outname1])
    print 'Done with first run of PSMC.'
    sys.stdout.flush()
    #run the recal script and run psmc again.
    while recalnums > 1:
        (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
        subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', 'test.psmc', outname1])
        recalnums -= 1
        print 'Recals left', recalnums
        sys.stdout.flush()
    (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', outname2, outname1])
    print 'Finished final recalibration run.'

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    outfile1 = dxpy.upload_local_file(outname1)
    outfile2 = dxpy.upload_local_file(outname2)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["outfile1"] = dxpy.dxlink(outfile1)
    output["outfile2"] = dxpy.dxlink(outfile2)

    return output
Example #24
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [
        r.replace('.tar.gz', '') for r in job_inputs['replicate_idr_files']
    ]
    pseudo_replicate_idr_prefixes = [
        r.replace('.tar.gz', '')
        for r in job_inputs['pseudo_replicate_idr_files']
    ]
    pooled_pseudo_replicate_idr_prefix = job_inputs[
        'pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep, num_peaks_each_pseudo_rep,
     numPeaks_Rep0) = get_thresholds(
         replicate_idr_prefixes, pseudo_replicate_idr_prefixes,
         pooled_pseudo_replicate_idr_prefix,
         job_inputs['replicate_peaks_threshold'],
         job_inputs['pseudo_replicate_peaks_threshold'],
         job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(
        job_inputs['pooled_replicate_peaks_file'])
    coi = {
        'signal.value': 7,
        'p.value': 8,
        'q.value': 9
    }[job_inputs['ranking_measure']]
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, max_numPeaks_Rep,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, opt_thresh,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file(
        '{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file('{0}_optimal.regionPeak.gz'.format(
        job_inputs['output_prefix']))

    return {
        'conservative_peak_calls': dxpy.dxlink(conservative_result),
        'optimal_peak_calls': dxpy.dxlink(optimal_result),
        'num_peaks_each_rep': num_peaks_each_rep,
        'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
        'num_peaks_pooled_pseudo_rep': numPeaks_Rep0
    }
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name,
                                                    str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se  ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance.  Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").

    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root + '_bismark.bam', bam_root + '.bam')
    return {
        "bam_file":
        dxpy.dxlink(dxpy.upload_local_file(bam_root + '.bam')),
        "report_file":
        dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
Example #26
    def create_sample_sheet(self, output_folder):
        ''' Description:
        '''

        misc_subfolder = output_folder + '/miscellany'

        command = 'python create_sample_sheet.py -r %s ' % self.run_name
        command += '-t %s ' % self.lims_token
        command += '-u %s ' % self.lims_url
        command += '-b %d ' % int(self.bcl2fastq_version)
        command += '-l %d' % int(self.lane_index)

        stdout, stderr = self.createSubprocess(cmd=command, pipeStdout=True)

        # Build the expected sheet name, then take the actual name parsed
        # from the helper script's stdout.
        self.sample_sheet = '%s_L%d_samplesheet.csv' % (self.run_name,
                                                        self.lane_index)
        stdout_elements = stdout.split()
        self.sample_sheet = stdout_elements[1]
        print 'This is the self.sample_sheet: %s' % self.sample_sheet

        # DEV: This is a dirty hack. Need to fix issue in LIMS ASAP -PBR 6/6/2016
        if self.seq_instrument not in ['Cooper', 'Gadget']:
            print 'This is not a HiSeq 4000 run: need to RC i5s'
            self.reverse_complement_i5(self.sample_sheet)
        else:
            print 'This is a HiSeq 4000 run; indexes are fine'
            # Reverse complement i5 index keys in barcode_dict
            dual_index = False
            barcode_dict_rci5 = {}
            for key in self.barcode_dict.keys():
                indexes = key.split('-')
                if len(indexes) > 1:
                    dual_index = True
                    index_i7 = indexes[0]
                    index_rci5 = reverse_complement(indexes[1])
                    barcode_rci5 = '-'.join([index_i7, index_rci5])
                    barcode_dict_rci5[barcode_rci5] = self.barcode_dict[key]
            if dual_index:
                self.barcode_dict = barcode_dict_rci5

        # DEV: insert check so that samplesheet is only uploaded if does not exist.
        #      Also, maybe add it to output?
        dxpy.upload_local_file(filename=self.sample_sheet,
                               properties=None,
                               project=self.lane_project_id,
                               folder=misc_subfolder,
                               parents=True)
        return self.sample_sheet
def sort_bam(job_inputs):
    input_bam = dxpy.DXFile(job_inputs['input_bam'])
    fn = input_bam.describe()['name']
    dxpy.download_dxfile(input_bam.get_id(), fn)

    # Sort and optionally remove unmapped and multimapped reads
    sorted_ofn = os.path.splitext(fn)[0] + '_sorted.bam'
    cmd = '/sambamba sort -t {0} -o /dev/stdout {1} '.format(
        multiprocessing.cpu_count() - 1, fn)
    if job_inputs['quality_filter']:
        cmd += '| /sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -o /dev/stdout /dev/stdin '
    cmd += '> ' + sorted_ofn
    print cmd
    subprocess.check_call(cmd, shell=True)

    # Count mapped, unique reads.
    cmd = '/sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -c ' + sorted_ofn
    print cmd
    num_uniquely_mapped_reads = int(
        subprocess.check_output(cmd, shell=True).strip())

    pcr_bottleneck_coefficient = calc_pcr_bottleneck_coefficient(sorted_ofn)

    final_ofn = sorted_ofn
    if job_inputs['remove_duplicates']:
        deduped_ofn = os.path.splitext(sorted_ofn)[0] + '_deduped.bam'
        md_metrics_ofn = os.path.splitext(
            sorted_ofn)[0] + '_deduped_metrics.txt'
        cmd = get_java_cmd()
        cmd += ' -jar /MarkDuplicates.jar I={0} O={1} METRICS_FILE={2} ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true '.format(
            sorted_ofn, deduped_ofn, md_metrics_ofn)
        print cmd
        subprocess.check_call(cmd, shell=True)
        bam_file = dxpy.dxlink(dxpy.upload_local_file(deduped_ofn).get_id())
        metrics_file = dxpy.dxlink(
            dxpy.upload_local_file(md_metrics_ofn).get_id())

        final_ofn = deduped_ofn
    else:
        bam_file = dxpy.dxlink(dxpy.upload_local_file(sorted_ofn).get_id())
        metrics_file = None

    return {
        'output_bam': bam_file,
        'dedup_metrics_file': metrics_file,
        'qc_uniquely_mapped_reads': num_uniquely_mapped_reads,
        'qc_pcr_bottleneck_coefficient': pcr_bottleneck_coefficient
    }
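
get_java_cmd() presumably sizes the JVM heap to the instance before invoking MarkDuplicates. A minimal guess at such a helper (Linux-only, reads /proc/meminfo; not the original implementation):

def get_java_cmd():
    # Give the JVM roughly 90% of physical memory, e.g. 'java -Xmx14g'.
    with open('/proc/meminfo') as fh:
        mem_kb = int(fh.readline().split()[1])
    return 'java -Xmx{0}g'.format(max(1, int(mem_kb * 0.9 / 1024 ** 2)))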
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.

    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name, str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se  ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance.  Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").

    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root+'_bismark.bam', bam_root+'.bam')
    return {
        "bam_file": dxpy.dxlink(dxpy.upload_local_file(bam_root+'.bam')),
        "report_file": dxpy.dxlink(dxpy.upload_local_file(bam_root+'_bismark_map_report.txt'))
    }
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)
        # Get help info
        dx_utils.run_cmd(['minimap2', '-h'])
        # Call minimap2
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = [
            'sambamba', 'sort', '-m',
            '{0}G'.format(int(dx_utils.get_memory(suffix='G'))), '-o', ofn,
            '-t',
            str(multiprocessing.cpu_count()), '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(['sambamba', 'index', ofn])
        # append to outputs
        output_ofns.append(ofn)
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): list of DXlinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters.
    Subjobs work on a fresh instance so files must be downloaded to the machine
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())

    return {"countDXLink": countDXlink}
Example #31
0
def main(fastq, genomeindex_targz):

    print "something else"
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")

    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")
    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub(r"\.bwt$", "", genome_file)

    bwa_cmd = ("bwa mem -t {nproc} {genome} {fastq} | "
               "samtools view -u -S - | "
               "samtools sort -m 256M -@ {nproc} - output".format(
                   nproc=multiprocessing.cpu_count(),
                   genome=genome_file,
                   fastq="input.fastq"))
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["bam"] = dxpy.dxlink(bam)

    return output
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):

    inputFiles = []
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    # rstrip(".bam") would strip any trailing '.', 'b', 'a', 'm' characters,
    # not the suffix, so trim the extension explicitly
    name = dxpy.DXFile(BAMs[0]).describe()['name']
    if name.endswith(".bam"):
        name = name[:-4]

    # Fill in your application code here.

    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["BAM"] = dxpy.dxlink(BAM)

    return output
Example #33
0
    def run_applet_with_flags(self, flag_list, num_files, file_size_bytes):
        with temporary_project(
                'TestDXBashHelpers.test_app1 temporary project') as dxproj:
            env = update_environ(DX_PROJECT_CONTEXT_ID=dxproj.get_id())

            # Upload file
            self.create_file_of_size("A.txt", file_size_bytes)
            remote_file = dxpy.upload_local_file(filename="A.txt",
                                                 project=dxproj.get_id(),
                                                 folder='/')

            # Build the applet, patching in the bash helpers from the
            # local checkout
            applet_id = build_app_with_bash_helpers(
                os.path.join(TEST_APPS, 'benchmark'), dxproj.get_id())

            # Add several files to the output
            applet_args = []
            applet_args.extend(['-iref=A.txt'] * num_files)
            cmd_args = [
                'dx', 'run', '--yes', '--watch',
                '--instance-type=mem1_ssd1_x2', applet_id
            ]
            cmd_args.extend(applet_args)
            cmd_args.extend(flag_list)
            run(cmd_args, env=env)
Example #34
0
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker;
    # all mappings are loaded into RAM, so limit each chunk to 10 million
    # alignments (SAM lines) to match the split size below
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header 
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file":dxpy.dxlink(split_bam.get_id()), "BED_file":BED_file}, "run_gbc"))
            
    run_shell( "ls -l" )

    gbc_agg_input = {"sub_reports":[]}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job":j.get_id(), "field":"file"})

    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()
    
    return {"results":{"job":agg_job, "field":"cover"}}
def main(input_file):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.

    subprocess.check_call(
        "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
        shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["output_file"] = dxpy.dxlink(output_file)

    return output
def scatter(orig_reads, split_size):
    # Fill in code here to do whatever is necessary to scatter the
    # input.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    splitsize = split_size * 1000000 * 4
    # each FQ read is 4 lines
    os.mkdir('splits')

    for f in orig_reads:
        reads_filename = dxpy.describe(f)['name']
        reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS)
        dxpy.download_dxfile(dxpy.DXFile(f).get_id(), reads_filename)

        reads_root_name = simplify_name() or reads_basename

        split_cmd = '/bin/zcat %s | /usr/bin/split -l %d -d - %s' % (reads_filename, splitsize, 'splits/' + reads_root_name)
        logger.info('* RUNNING %s' % split_cmd)
        # can't use shlex.split because of the pipe; split writes its output
        # files directly, so there is no stdout worth capturing
        subprocess.check_call(split_cmd, shell=True)

    splits = os.listdir('splits')
    logger.info("* Return from scatter: %s *" % splits)

    # Should we gzip here?
    return {
        "array_of_scattered_input": [ 
            dxpy.dxlink(dxpy.upload_local_file('splits/' + split_file)) for split_file in splits]
        }
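
strip_extensions and STRIP_EXTENSIONS also live elsewhere in the applet; a plausible stand-in, with a guessed suffix list, would be:

STRIP_EXTENSIONS = ['.fastq.gz', '.fq.gz', '.fastq', '.fq', '.gz']

def strip_extensions(filename, extensions):
    # Hypothetical helper: drop the first matching suffix, if any.
    for ext in extensions:
        if filename.endswith(ext):
            return filename[:-len(ext)]
    return filename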
def make_indexed_reference(ref_ID):

    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz", hidden=True, wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(name=ref_name + " (indexed for Bowtie2)",
                                           types=["BowtieLetterContigSetV2"],
                                           details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                                                    'original_contigset': dxpy.dxlink(ref_ID)})
    indexed_ref_record.close()

    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
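
A downstream mapping step, not part of this excerpt, would typically resolve the archive back out of the record's details. Roughly, and assuming only the 'index_archive' field name used above:

import dxpy

def fetch_bowtie2_index(record_id, local_name="reference.tar.xz"):
    # Follow the 'index_archive' dxlink stored in the record's details
    # (the field name comes from the record created above; the rest is
    # an illustrative assumption).
    details = dxpy.DXRecord(record_id).get_details()
    file_id = details['index_archive']['$dnanexus_link']
    dxpy.download_dxfile(file_id, local_name)
    return local_name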
Example #38
0
def main(inputs, prefix=None):

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }

    return output
Example #39
0
def main(contig_set):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    contig_set = dxpy.DXRecord(contig_set)

    # The following line extracts the name from the file object so that
    # outputs can be named intelligently. It is not automatically generated by
    # the app wizard.

    name = contig_set.describe()['name'].replace(".fa", "")

    # Fill in your application code here.

    subprocess.check_call("dx-contigset-to-fasta %s %s.fa" % (contig_set.get_id(), name), shell=True)
    subprocess.check_call("gzip %s.fa" % name, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    fasta_gz = dxpy.upload_local_file("%s.fa.gz" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["fasta_gz"] = dxpy.dxlink(fasta_gz)

    return output
def main(quants_a, quants_b):

    # tool_versions.py --dxjson dnanexus-executable.json
    sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    # Create an appropriate name for output files
    out_root = root_name_from_pair(dxfile_a.name.split('.')[0], dxfile_b.name.split('.')[0])
    mad_plot_file = out_root + '_mad_plot.png'

    # DX/ENCODE independent script is found in resources/usr/bin
    print("* Running MAD.R...")
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])
    subprocess.check_call(['mv', "MAplot.png", mad_plot_file])
    
    print "* package properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics,indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Upload Plot..."
    plot_dxfile = dxpy.upload_local_file(mad_plot_file,properties=props,details=qc_metrics)
    
    return { "metadata": meta_string, "mad_plot": plot_dxfile }
Example #41
0
def main(psmcfa, psmc, outname, xchr, timemax, window):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    psmcfa = dxpy.DXFile(psmcfa)
    psmc = dxpy.DXFile(psmc)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(psmcfa.get_id(), "psmcfa")
    dxpy.download_dxfile(psmc.get_id(), "psmc")

    # Fill in your application code here.
    (tmaxNew, parfile) = writeRecalFile('psmc', timemax, window, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', outname, 'psmcfa'])
    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    outfile = dxpy.upload_local_file(outname)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["outfile"] = dxpy.dxlink(outfile)

    return output
def produce_qc_report(individual_json_outputs,
                      sample_name,
                      output_project,
                      output_folder,
                      properties=None):
    """Combine the various statistics collected into a single dict for
    output."""

    # Avoid a shared mutable default; work on a copy so the caller's
    # dict is not modified when 'file_type' is added below.
    properties = dict(properties or {})

    output = {'Sample name': sample_name}
    misc_subfolder = output_folder + '/miscellany'

    for j in individual_json_outputs:
        for k in j:
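            # assumes each top-level value is itself a dict of per-metric stats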
            if k in output:
                output[k].update(j[k])
            else:
                output[k] = j[k]

    ofn = sample_name + '_stats.json'
    with open(ofn, 'w') as output_fh:
        output_fh.write(json.dumps(output))

    properties['file_type'] = 'qc_stats'
    output_json_file = dxpy.upload_local_file(filename=ofn,
                                              project=output_project,
                                              properties=properties,
                                              folder=misc_subfolder,
                                              parents=True)

    return {'combined_json_file': dxpy.dxlink(output_json_file)}
Example #43
0
def calc_mismatch_per_cycle_stats(bam_file,
                                  aligner,
                                  output_project,
                                  output_folder,
                                  properties=None):
    # Avoid a shared mutable default; work on a copy
    properties = dict(properties or {})
    logger = []
    misc_subfolder = output_folder + '/miscellany'

    bam_file = dxpy.DXFile(bam_file)
    bam_filename = bam_file.describe()['name']
    dxpy.download_dxfile(bam_file.get_id(), bam_filename)
    ofn = os.path.splitext(bam_filename)[0] + '.mm_stats'

    # Change permissions
    cmd = 'chmod +x /bwa_mismatches'
    run_cmd(cmd, logger)
    cmd = '/bwa_mismatches -o {0} -m {1} {2}'.format(ofn, ALIGNERS[aligner],
                                                     bam_filename)
    run_cmd(cmd, logger)

    properties['file_type'] = 'mismatch_stats'
    mismatch_per_cycle_stats = dxpy.upload_local_file(filename=ofn,
                                                      project=output_project,
                                                      folder=misc_subfolder,
                                                      properties=properties,
                                                      parents=True)

    return {
        'mismatch_per_cycle_stats': mismatch_per_cycle_stats,
        "tools_used": logger
    }
def main(quants_a, quants_b):

    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(['tool_versions.py', '-a', APP_SCRIPT, '-av', APP_VER])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    print "* Runnning MAD.R..."
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])
    quants_a_name = dxfile_a.name.split('.')
    quants_b_name = dxfile_b.name.split('.')
    filename = quants_a_name[0] + '_' + quants_b_name[0] + '_' + quants_a_name[1] + '_mad_plot.png'
    subprocess.check_call(['mv', "MAplot.png", filename])
    
    print "* package properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics,indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Upload Plot..."
    plot_dxfile = dxpy.upload_local_file(filename,properties=props,details=qc_metrics)
    
    return { "metadata": meta_string, "mad_plot": plot_dxfile }
def create_tools_used_json_file(tools_used,
                                output_project,
                                output_folder,
                                properties=None):

    # Avoid a shared mutable default; work on a copy
    properties = dict(properties or {})

    misc_subfolder = output_folder + '/miscellany'

    tools_used_dict = {}
    tools_used_dict['name'] = get_app_title()
    tools_used_dict['commands'] = []

    for tools in tools_used:
        tools_used_dict['commands'] += tools

    fn = tools_used_dict['name'] + '_tools_used.json'
    with open(fn, 'w') as fh:
        fh.write(json.dumps(tools_used_dict))

    properties['file_type'] = 'tools_used'
    tools_used_json_file = dxpy.upload_local_file(filename=fn,
                                                  project=output_project,
                                                  folder=misc_subfolder,
                                                  properties=properties,
                                                  parents=True)

    return {'tools_used_json_file': tools_used_json_file}
Example #47
0
def read_duplication(BAM_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    run_shell( " ".join(["read_duplication.py", "-i mappings.bam", "-o read_dup"]))
    run_shell( " ".join(["cat", "read_dup.pos.DupRate.xls", "read_dup.seq.DupRate.xls", ">", "read_dup.txt"]))
    results_id = dxpy.upload_local_file("read_dup.txt", wait_on_close=True).get_id()
    return {"results":results_id}
Example #51
0
    def _format_data_file(self, df: DataFile) -> dict:
        if isinstance(df.localizer, UrlLocalizer):
            ul = cast(UrlLocalizer, df.localizer)
            if ul.url.startswith("dx://"):
                return dxpy.dxlink(*ul.url[5:].split(":"))

        file_name = df.local_path.name

        existing_files = list(dxpy.find_data_objects(
            classname="file",
            state="closed",
            name=file_name,
            project=self._project_id,
            folder=self._folder,
            recurse=False
        ))

        if not existing_files:
            # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
            #  parallelization
            return dxpy.dxlink(dxpy.upload_local_file(
                str(df.path),
                name=file_name,
                project=self._project_id,
                folder=self._folder,
                parents=True,
                wait_on_close=True
            ))
        elif len(existing_files) == 1:
            return dxpy.dxlink(existing_files[0]["id"], self._project_id)
        else:
            raise RuntimeError(
                f"Multiple files with name {file_name} found in "
                f"{self._project_id}:{self._folder}"
            )
Example #52
0
    def upload_lane_html(self, raw_properties, tags):
        '''Upload lane.html file to DNAnexus project.

        Args:
            raw_properties (dict): Properties with values of different types.
            tags (list): Tags to apply to the uploaded file.

        Returns:
            dict: DXLink to lane.html file on DNAnexus object store.
        '''

        # Convert all property values to strings
        properties = {key: str(value) for key, value in raw_properties.items()}
        properties['file_type'] = 'lane_html'

        project_folder = '{}/miscellany'.format(self.project_path)

        local_file_path = (
            '{}/Reports/html/'.format(LOCAL_OUTPUT) +
            '{}/all/all/all/lane.html'.format(properties['flowcell_id']))
        remote_file_name = '{}_L{}.lane.html'.format(properties['run_name'],
                                                     properties['lane_index'])
        lane_html_dxid = dxpy.upload_local_file(filename=local_file_path,
                                                name=remote_file_name,
                                                properties=properties,
                                                tags=tags,
                                                project=self.project_dxid,
                                                folder=project_folder,
                                                parents=True)
        return dxpy.dxlink(lane_html_dxid)
Example #53
0
def main(inputs):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    pooled_filename = '-'.join(
        [splitext(splitext(fn)[0])[0]
         for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -c'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["pooled"] = dxpy.dxlink(pooled)

    return output
Example #54
0
    def upload_tools_used(self, tools_used_dict, raw_properties):
        '''Write console commands to Tools Used file & upload.

        Args:
            tools_used_dict (dict): Description of executables & configurations.
            raw_properties (dict): Properties with values of different types.

        Returns:
            str: DXLink to "tools used" file on DNAnexus object store.
        '''

        # Convert all property values to strings
        properties = {key: str(value) for key, value in raw_properties.items()}

        # Write file
        local_file_path = 'bcl2fastq_tools_used.json'
        with open(local_file_path, 'w') as TOOLS:
            TOOLS.write(json.dumps(tools_used_dict))

        # Upload file
        properties['file_type'] = 'tools_used'
        project_folder = '{}/miscellany'.format(self.project_path)
        tools_used_dxid = dxpy.upload_local_file(filename=local_file_path,
                                                 properties=properties,
                                                 project=self.project_dxid,
                                                 folder=project_folder,
                                                 parents=True)
        return dxpy.dxlink(tools_used_dxid)
Example #55
0
def postprocess(**inputs):
    kwargs = inputs["kwargs"]
    subjob_outputs = inputs["subjob_outputs"]
    print("\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs)))

    output_prefix = kwargs["output_prefix"]
    variant_suffixes = kwargs["variant_suffixes"]
    
    app_output_fn = {}
    for subjob_output in subjob_outputs:
        for output_type, link in subjob_output.items():
            file_id = link["$dnanexus_link"]
            filename = output_prefix + "_" + variant_suffixes[output_type]

            print("Downloading " + str(file_id) + " into " + filename)
            dxpy.download_dxfile(dxid=file_id, filename=filename, append=True)
            app_output_fn[output_type] = filename

    postprocess_outputs = {}
    need_to_renumber = ["deletions", "short_inserts", "tandem_duplications", "inversions", "large_inserts"]
    for output_type, fn in app_output_fn.items():
        out_fn = fn
        if output_type in need_to_renumber:
            out_fn = RenumberMergedOutput(fn, fn + "_renumbered")
        print("\nUploading {file} as {fn}".format(file=out_fn, fn=fn))
        postprocess_outputs[output_type] = dxpy.dxlink(dxpy.upload_local_file(out_fn, name=fn))

    if kwargs["export_vcf"]:
        DownloadRefFasta(kwargs["reference_fasta"])
        postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_prefix, ref_fn="reference_fasta") 
        
    return postprocess_outputs
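
Note the append=True passed to download_dxfile above: each subjob's partial result is appended to the same local file for its variant type, so the merged outputs accumulate in subjob order before being renumbered and uploaded.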
Example #56
0
def s3cp(accession, key=None):

    (AUTHID, AUTHPW, SERVER) = common.processkey(key, KEYFILE)
    keypair = (AUTHID, AUTHPW)

    url = SERVER + '/search/?type=file&accession=%s&format=json&frame=embedded&limit=all' % (
        accession)
    #get the file object
    response = common.encoded_get(url, keypair)
    logger.debug(response)

    #select your file
    result = response.get('@graph')
    if not result:
        logger.error('Failed to find %s at %s' % (accession, url))
        return None
    else:
        f_obj = result[0]
        logger.debug(f_obj)

    #make the URL that will get redirected - get it from the file object's href property
    encode_url = urlparse.urljoin(SERVER, f_obj.get('href'))
    logger.debug("URL: %s" % (encode_url))
    logger.debug("%s:%s" % (AUTHID, AUTHPW))
    #stream=True avoids actually downloading the file, but it evaluates the redirection
    r = requests.get(encode_url,
                     auth=(AUTHID, AUTHPW),
                     headers={'content-type': 'application/json'},
                     allow_redirects=True,
                     stream=True)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        logger.error('%s href does not resolve' % (f_obj.get('accession')))
    logger.debug("Response: %s", (r))

    #this is the actual S3 https URL after redirection
    s3_url = r.url
    logger.debug(s3_url)

    #release the connection
    r.close()

    #split up the url into components
    o = urlparse.urlparse(s3_url)

    #pull out the filename
    filename = os.path.basename(o.path)

    #hack together the s3 cp url (with the s3 method instead of https)
    bucket_url = S3_SERVER.rstrip('/') + o.path

    #cp the file from the bucket
    subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' % (bucket_url)),
                          stderr=subprocess.STDOUT)
    subprocess.check_call(shlex.split('ls -l %s' % (filename)))

    dx_file = dxpy.upload_local_file(filename)

    return dx_file