# NOTE: these excerpts come from DNAnexus applet code and assume shared
# module-level imports and constants defined elsewhere in each applet:
# dx_utils (the applets' helper module), dxpy, glob, multiprocessing, os,
# re, resource, subprocess, tempfile, plus VCF_FOFN, MAP_THREADS,
# SORT_THREADS, HYBRID_DIR, TOOLS_DIR and SCRIPTS_DIR.
def main(**job_inputs):
    input_vcfs = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in job_inputs['input_vcfs']
    ]
    input_ref = dx_utils.download_and_gunzip_file(job_inputs['ref_fasta'])

    # create index files for inputs; index each VCF in an explicit loop
    # (a bare map() would be lazy under Python 3 and never run tabix)
    dx_utils.run_cmd(['samtools', 'faidx', input_ref])
    for vcf in input_vcfs:
        dx_utils.run_cmd('tabix {0}'.format(vcf))
    with open(VCF_FOFN, 'w') as fh:
        fh.write('\n'.join(input_vcfs))

    # get the bcftools version and help doc
    cmd = ['bcftools', '--help']
    dx_utils.run_cmd(cmd)

    output_prefix = job_inputs.get('output_prefix', '')
    output_bcf = output_prefix + 'concat' + '.bcf'
    # concatenate the bcf/vcf files
    concat_cmd = ['bcftools', 'concat', '-f', VCF_FOFN]
    # the -e expression must be its own argv element; embedding shell quotes
    # in a single "-e'...'" token would be passed to bcftools verbatim
    view_cmd = ['bcftools', 'view', '-Ou', '-e', 'type="ref"']
    norm_cmd = [
        'bcftools', 'norm', '-Ob', '-f', input_ref, '-o', output_bcf,
        '--threads={0}'.format(multiprocessing.cpu_count())
    ]
    # run the commands
    dx_utils.run_pipe(concat_cmd, view_cmd, norm_cmd)

    # index the concatenated bcf file
    dx_utils.run_cmd(['bcftools', 'index', output_bcf])

    # call consensus
    output_fasta = output_prefix + 'consensus.fasta'
    consensus_filter = 'QUAL>1 && (GT="AA" || GT="Aa")'
    consensus_cmd = [
        'bcftools', 'consensus', '-i', consensus_filter, '-Hla', '-f',
        input_ref, output_bcf
    ]
    dx_utils.run_pipe(consensus_cmd, outputFile=output_fasta)

    # save the changes to vcf
    output_vcf = output_prefix + 'changes.vcf.gz'
    vcf_cmd = [
        'bcftools', 'view', '-i', consensus_filter, '-Oz',
        '--threads={0}'.format(multiprocessing.cpu_count()), output_bcf
    ]
    dx_utils.run_pipe(vcf_cmd, outputFile=output_vcf)

    consensus_link = dx_utils.gzip_and_upload(output_fasta)
    print(consensus_link)
    output = {}
    output['consensus_fasta'] = consensus_link
    output['consensus_vcf'] = dxpy.dxlink(dxpy.upload_local_file(output_vcf))

    return output
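
# `dx_utils.run_pipe` above chains the argv lists into a single unix pipeline,
# optionally redirecting the last stage into a file. A minimal sketch of that
# behavior, assuming plain subprocess semantics (the real dx_utils helper
# likely adds logging and richer error reporting):
def _run_pipe_sketch(*cmds, outputFile=None):
    procs = []
    prev_stdout = None
    for i, cmd in enumerate(cmds):
        is_last = i == len(cmds) - 1
        if not is_last:
            stdout = subprocess.PIPE
        elif outputFile:
            stdout = open(outputFile, 'wb')
        else:
            stdout = None  # inherit the parent's stdout
        procs.append(subprocess.Popen(cmd, stdin=prev_stdout, stdout=stdout))
        if prev_stdout is not None:
            prev_stdout.close()  # so upstream stages can receive SIGPIPE
        prev_stdout = procs[-1].stdout
    for proc in procs:
        if proc.wait() != 0:
            raise RuntimeError(
                'pipeline stage exited with {0}'.format(proc.returncode))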
def _run_meryl(output, sequences, k_mer_size, is10x):
    # move the canu-bundled install aside (presumably so the local ./meryl
    # binary takes precedence), then sanity-check it
    dx_utils.run_cmd(['mkdir', 'unuse'])
    dx_utils.run_cmd(['mv', '/usr/src/canu', '/home/dnanexus/unuse'])
    dx_utils.run_cmd(['chmod', '777', 'meryl'])
    dx_utils.run_cmd(['./meryl', '--version'])

    # budget ~60% of RAM for meryl; strip() the trailing newline so the value
    # interpolates cleanly into "memory={}" below
    mem_in_gb = dx_utils.run_cmd(
        "head -n1 /proc/meminfo | awk '{print int($2*0.6/1024/1024)}'",
        returnOutput=True).strip()
    print('mem in GB', mem_in_gb)

    # raise the soft open-file limit in-process; running `ulimit` through
    # run_cmd would only affect a throwaway shell
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    resource.setrlimit(resource.RLIMIT_NOFILE, (min(32000, hard), hard))

    meryl_kmer = ['./meryl', 'threads={}'.format(multiprocessing.cpu_count()),
                  'k={}'.format(k_mer_size), 'memory={}'.format(mem_in_gb)]
    dx_utils.run_cmd(['mkdir', '-p', 'split'])  # chunked fastqs land here

    for file_ref in sequences:

        dx_utils.download_and_gunzip_file(file_ref)
        each_file_name=dxpy.describe(file_ref)["name"].replace(".gz",'')
        print('processing {0}'.format(each_file_name))
        each_file_prefix=each_file_name.replace('fastq','')
        each_file_prefix=each_file_prefix.replace('fq','')
        if 'R1' in each_file_name:
            if is10x:
                print('10x trimming')
                # trim the first 23 characters (the 10x barcode region) from
                # sequence and quality lines (the even-numbered fastq lines),
                # then chunk into 300M-line pieces under split/
                dx_utils.run_cmd(
                    "cat "+ each_file_name +" | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}'| split -a 4 -d -l 300000000 --additional-suffix='.fq' - split/"+each_file_prefix)
            else:
                print('no trimming')
                dx_utils.run_cmd(
                    "cat {0} | split -a 4 -d -l 300000000 --additional-suffix='.fq' - split/{1}".format(each_file_name, each_file_prefix))
        else:
            print('no trimming')
            dx_utils.run_cmd(
                "cat {0} | split -a 4 -d -l 300000000 --additional-suffix='.fq' - split/{1}".format(each_file_name, each_file_prefix))

    # count k-mers in each chunk, writing one .meryl database per chunk
    for split_file in os.listdir('/home/dnanexus/split'):
        split_file_prefix = split_file.replace('.fq', '')
        command = meryl_kmer + [
            'count', 'split/' + split_file,
            'output', 'split/' + split_file_prefix + '.meryl'
        ]
        dx_utils.run_cmd(command)

    # collect the per-chunk .meryl directories (decode to text so the split
    # works under Python 3) and merge them with union-sum
    folder_string = subprocess.check_output(
        'ls -d split/*/', shell=True, universal_newlines=True)
    folder_list = [f.rstrip('/') for f in folder_string.strip().split('\n')]
    meryl_union_sum = [
        './meryl',  # the local binary, as invoked above
        'threads={}'.format(multiprocessing.cpu_count()),
        'k={}'.format(k_mer_size), 'memory={}'.format(mem_in_gb),
        'union-sum', 'output', 'output'
    ]
    meryl_union_sum.extend(folder_list)
    dx_utils.run_cmd(meryl_union_sum)

    with open("mer_counts.tsv",'w') as kmers_index:
        
        subprocess.check_call(["./meryl", "histogram", "output"],shell=False,stdout=kmers_index)

    output["histogram"] = dxpy.dxlink(dxpy.upload_local_file("mer_counts.tsv"))
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r'\.(fastq|fasta|fa|fq)(\.gz)?$', '', read)
        ofn = '{0}.mapped.bam'.format(output_prefix)
        # Get help info
        dx_utils.run_cmd(['minimap2', '-h'])
        # Call minimap2
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = [
            'sambamba', 'sort', '-m',
            '{0}G'.format(int(dx_utils.get_memory(suffix='G'))), '-o', ofn,
            '-t',
            str(multiprocessing.cpu_count()), '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(['sambamba', 'index', ofn])
        # append to outputs
        output_ofns.append(ofn)
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
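
# The minimap2 mappers in this collection each pick the preset from the
# declared datatype; a tiny helper would keep those branches in sync
# (a sketch; the preset names are the standard minimap2 ones):
def _minimap2_preset(datatype):
    presets = {'PacBio': 'map-pb', 'CCS': 'asm20'}
    return presets.get(datatype, 'map-ont')  # default to Oxford Nanopore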
def run_minimap_index(genome_fastagz):
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ofn = os.path.splitext(ref_genome)[0] + '.mmi'

    minimap2_indx_cmd = ["minimap2", "-d", ofn, ref_genome]
    dx_utils.run_cmd(minimap2_indx_cmd)

    return {'genome_mmi': dxpy.dxlink(dxpy.upload_local_file(ofn))}
def run_bwa(reads, ref_genome):
    dx_utils.download_and_gunzip_file(ref_genome)
    bwt_filename = dx_utils.run_cmd('ls *.bwt', returnOutput=True).strip()
    genome_prefix = os.path.splitext(bwt_filename)[0]

    reads = dx_utils.download_and_gunzip_file(reads,
                                              skip_decompress=True,
                                              create_named_pipe=True)
    ofn = re.sub(r'\.fastq(\.gz)?$', '.bam', reads)

    # Hi-C mapping: -B 8 raises the mismatch penalty, and filter_five_end.pl
    # (from the Arima mapping pipeline) keeps each read's 5'-most alignment
    cmd = 'set -e -o pipefail; bwa mem -t {0} -B 8 {1} {2} '.format(
        multiprocessing.cpu_count(), genome_prefix, reads)
    cmd += '| perl ./filter_five_end.pl | samtools view -@{0} -Sb - > {1} '.format(
        multiprocessing.cpu_count(), ofn)
    dx_utils.run_cmd(cmd)

    return {'output_bam': dxpy.dxlink(dxpy.upload_local_file(ofn))}
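
# `create_named_pipe=True` above lets bwa start reading while the fastq is
# still downloading. A rough sketch of the idea, assuming the `dx cat` CLI is
# available (the real dx_utils helper also handles cleanup and error checks):
def _stream_dx_file(file_id, fifo_name):
    os.mkfifo(fifo_name)
    # the writer runs in the background; readers block until data arrives
    subprocess.Popen('dx cat {0} | zcat > {1}'.format(file_id, fifo_name),
                     shell=True)
    return fifo_name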
def combine_bams(fwd_bam, rev_bam):
    fwd_bam = dx_utils.download_and_gunzip_file(fwd_bam,
                                                create_named_pipe=True)
    rev_bam = dx_utils.download_and_gunzip_file(rev_bam,
                                                create_named_pipe=True)

    ofn = re.sub(r'\.bam$', '.combined.bam', fwd_bam)

    #cmd = 'perl two_read_bam_combiner.pl {0} {1} | samtools view -@{2} -Sb - | samtools sort -o {3} - '
    cmd = 'set -e -o pipefail; perl two_read_bam_combiner.pl {0} {1} | /opt/biobambam2/bin/bamsormadup inputformat=sam indexfilename={2}.bai > {2} '
    cmd = cmd.format(fwd_bam, rev_bam, ofn)
    dx_utils.run_cmd(cmd)

    return {
        'output_bam':
        dxpy.dxlink(dxpy.upload_local_file(ofn, name=fwd_bam)),
        'output_bai':
        dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai',
                                           name=fwd_bam + '.bai'))
    }
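
# Typical wiring of the two Hi-C steps above (input names are illustrative,
# not the applet's actual spec):
#   fwd = run_bwa(job_inputs['forward_reads'], job_inputs['ref_genome'])
#   rev = run_bwa(job_inputs['reverse_reads'], job_inputs['ref_genome'])
#   combined = combine_bams(fwd['output_bam'], rev['output_bam'])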
def run_minimap_index(genome_fastagz):
    # load the docker images
    dx_utils.run_cmd(['docker', 'load', '-i', '/opt/minimap2_images.tar'])

    # download the reference genome
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ofn = os.path.splitext(ref_genome)[0] + '.mmi'

    # run minimap2 index
    docker_cmd = [
        'docker', 'run', '-v', '/home/dnanexus:/home/dnanexus', '-w',
        '/home/dnanexus'
    ]
    minimap2_indx_cmd = docker_cmd + [
        'quay.io/biocontainers/minimap2:2.17--h84994c4_0'
    ]
    minimap2_indx_cmd += ["minimap2", "-d", ofn, ref_genome]
    dx_utils.run_cmd(minimap2_indx_cmd)

    return {'genome_mmi': dxpy.dxlink(dxpy.upload_local_file(ofn))}
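
# The docker-run prefix above recurs across these applets; pulling it into a
# helper keeps the mount and workdir flags in one place (a sketch, not part
# of the original applet code):
def _docker_argv(image, stdin=False):
    argv = ['docker', 'run', '-v', '/home/dnanexus:/home/dnanexus',
            '-w', '/home/dnanexus']
    if stdin:
        argv.append('-i')  # keep stdin open for containers that read a pipe
    return argv + [image]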
def main(input_assembly,
         hic_alignments,
         restriction_enzyme_bases,
         filter_alignments,
         input_assembly_graph=None):
    # make sure we can run salsa
    dx_utils.run_cmd("python /opt/SALSA/run_pipeline.py -h")

    input_assembly = dx_utils.download_and_gunzip_file(input_assembly)
    alignment_prefix = input_assembly.split(".fasta")[0]

    # process inputs and convert to bed
    first_file = True
    for bam_file in hic_alignments:
        fn = dxpy.describe(bam_file['$dnanexus_link'])['name']
        cmd = 'dx cat {0}'.format(bam_file['$dnanexus_link'])
        prefix, suffix = os.path.splitext(fn)
        if suffix == '.gz':
            cmd += '| gunzip '
            fn = prefix
        cmd += '| bedtools bamtobed -i stdin'
        if first_file:
            cmd += ' > {0}.bed'.format(alignment_prefix)
            first_file = False
        else:
            cmd += ' >> {0}.bed'.format(alignment_prefix)
        dx_utils.run_cmd(cmd)

    # index the ref
    cmd = 'samtools faidx {0} '.format(input_assembly)
    dx_utils.run_cmd(cmd)

    # if we were asked to filter by contig names, make a bed file and subset the input bed
    if filter_alignments:
        with open('%s.fai' % input_assembly) as f, \
                open('%s.contigs.bed' % alignment_prefix, 'w') as o:
            for line in f:
                fields = line.strip().split()
                o.write('%s\t1\t%s\n' % (fields[0], fields[1]))

        cmd = 'bedtools intersect -wa -a {0}.bed -b {0}.contigs.bed > {0}.filtered.bed'.format(
            alignment_prefix)
        dx_utils.run_cmd(cmd)
    else:
        cmd = 'ln -s {0}.bed {0}.filtered.bed'.format(alignment_prefix)
        dx_utils.run_cmd(cmd)

    # now sort the bed file
    cmd = "sort -T . -k4 {0}.filtered.bed > {0}.sorted.bed".format(
        alignment_prefix)
    dx_utils.run_cmd(cmd)

    cmd = 'python /opt/SALSA/run_pipeline.py -a {0} -b {1}.sorted.bed -l {0}.fai -o {2} -e {3} -m yes -p yes '
    cmd = cmd.format(input_assembly, alignment_prefix, './',
                     ','.join(restriction_enzyme_bases))
    if input_assembly_graph is not None:
        # fetch the assembly graph locally and hand its path to -g
        input_assembly_graph = dx_utils.download_and_gunzip_file(input_assembly_graph)
        cmd = "%s -g %s" % (cmd, input_assembly_graph)

    dx_utils.run_cmd(cmd)

    output = {}

    # final scaffold
    final_fasta = glob.glob('scaffold*FINAL.fasta')[0]
    output['final_scaffold_fasta'] = dx_utils.gzip_and_upload(final_fasta)

    # final agp
    final_agp = glob.glob('scaffold*FINAL.agp')[0]
    output['final_scaffold_agp'] = dx_utils.gzip_and_upload(final_agp)

    # alignment_iteration_1 bed
    alignment_iteration_1 = glob.glob('alignment_iteration_1.bed')[0]
    output['alignment_iteration_1'] = dx_utils.gzip_and_upload(
        alignment_iteration_1)

    # scaffold_length_iteration_1
    scaffold_length_iteration_1 = glob.glob('scaffold_length_iteration_1')[0]
    output['scaffold_length_iteration_1'] = dx_utils.gzip_and_upload(
        scaffold_length_iteration_1)

    # all others
    files = glob.glob('scaffold*fasta')
    files.extend(glob.glob('scaffold*agp'))
    if final_fasta in files:
        files.remove(final_fasta)
    if final_agp in files:
        files.remove(final_agp)
    print(files)

    output['scaffold'] = dx_utils.tar_files_and_upload(files, alignment_prefix)

    return output
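
# `dx_utils.gzip_and_upload`, used for most outputs here, presumably gzips a
# local file and returns a dxlink to the uploaded result. A minimal
# equivalent under that assumption:
def _gzip_and_upload_sketch(fn):
    subprocess.check_call(['gzip', '-f', fn])
    return dxpy.dxlink(dxpy.upload_local_file(fn + '.gz'))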
def map_reads_pbmm2(bam_files, pbi_files, genome_fastagz, genome_mmi):
    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f) for f in bam_files]
    pbis = [dx_utils.download_and_gunzip_file(f) for f in pbi_files]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # find out what environment we're dealing with
    print(os.environ)

    # use environment to set path
    pbmm2_env = {
        "PATH": os.environ['PATH'] + os.pathsep + '/anaconda/bin/',
        "SHELL": '/bin/bash',
        "USER": '******'
    }

    # create bam dataset
    with dx_utils.set_env(**pbmm2_env):
        # Iterate over bam files
        output_ofns = []
        for bam in reads:
            prefix = re.sub("(\.subreads)?(\.bam){1}$", "", bam)
            ofn = '{0}.mapped.bam'.format(prefix)
            if bam + '.pbi' not in pbis:
                dx_utils.run_cmd('pbindex {0}'.format(bam))

            dx_utils.run_cmd(['pbmm2', 'align', '--help'])
            # Call pbmm2 align (minimap2 under the hood); compute the memory
            # available to each sorting thread, reserving 40G of headroom
            system_memory = dx_utils.get_memory(suffix='G') - 40
            memory_per_thread = system_memory / SORT_THREADS
            pbmm2_cmd = [
                '/anaconda/bin/pbmm2', 'align',
                str(ref_genome_mmi),
                str(bam),
                str(ofn), '-j',
                str(MAP_THREADS), '--sort', '-J',
                str(SORT_THREADS), '-m', '{0}G'.format(int(memory_per_thread)),
                '--log-level', 'DEBUG'
            ]

            # there's some env variable that is causing pbmm2 to misbehave
            # when run from a python shell. Therefore, write the command to a
            # temp .sh file and execute it that way.
            tmp_cmd = tempfile.NamedTemporaryFile(mode='w',
                                                  suffix='.sh',
                                                  delete=False)
            tmp_cmd.write(' '.join(pbmm2_cmd))
            tmp_cmd.close()
            print('Executing: {0}'.format(' '.join(pbmm2_cmd)))
            dx_utils.run_cmd(['sudo', 'bash', tmp_cmd.name])

            # Create index
            cmd = ['samtools', 'index', ofn]
            dx_utils.run_cmd(cmd)

            # append to outputs
            output_ofns.append(ofn)
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
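
# The temp-.sh workaround above generalizes: serialize an argv to a throwaway
# script and run it through a fresh shell so the problematic environment never
# reaches pbmm2. The same pattern as the inline code, as a reusable sketch:
def _run_via_shell_script(argv):
    with tempfile.NamedTemporaryFile(mode='w', suffix='.sh',
                                     delete=False) as fh:
        fh.write(' '.join(argv))
        script_name = fh.name
    dx_utils.run_cmd(['sudo', 'bash', script_name])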
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # load the docker images
    dx_utils.run_cmd(['docker', 'load', '-i', '/opt/minimap2_images.tar'])

    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    elif datatype == 'CCS':
        preset_param = 'asm20'
    else:
        preset_param = 'map-ont'

    docker_cmd = [
        'docker', 'run', '-v', '/home/dnanexus:/home/dnanexus', '-w',
        '/home/dnanexus'
    ]
    minimap2_docker_cmd = docker_cmd + [
        'quay.io/biocontainers/minimap2:2.17--h84994c4_0'
    ]
    # Run sambamba with -i flag, meaning it can read from stdin
    sambamba_docker_cmd = docker_cmd + [
        '-i', 'quay.io/biocontainers/sambamba:0.6.8--h682856c_1'
    ]
    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r'\.(fastq|fasta|fa|fq)(\.gz)?$', '', read)
        ofn = '{0}.mapped.bam'.format(output_prefix)

        # Call minimap2
        minimap2_cmd = minimap2_docker_cmd + [
            'minimap2', '-ax', preset_param, ref_genome_mmi, read
        ]
        view_cmd = sambamba_docker_cmd + [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = sambamba_docker_cmd + [
            'sambamba', 'sort', '-m', '{0}G'.format(
                int(dx_utils.get_memory(suffix='G'))), '-o', ofn, '-t',
            str(multiprocessing.cpu_count()), '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(sambamba_docker_cmd + ['sambamba', 'index', ofn])
        # append to outputs
        output_ofns.append(ofn)
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
def map_reads_pbmm2(bam_files, pbi_files, genome_fastagz, genome_mmi,
                    pbbamify):
    # Unpack the docker image
    dx_utils.run_cmd("docker load -i /opt/pbmm2/pbmm2_docker.tar.gz")
    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f) for f in bam_files]
    pbis = [
        dx_utils.download_and_gunzip_file(f) for f in pbi_files
        if f is not None
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # make sure we have full paths of inputs
    reads = [os.path.join('/home/dnanexus', r) for r in reads]
    pbis = [os.path.join('/home/dnanexus', r) for r in pbis]
    ref_genome = os.path.join('/home/dnanexus', ref_genome)
    ref_genome_mmi = os.path.join('/home/dnanexus', ref_genome_mmi)

    # use environment to set the pbmm2 variable
    docker_run = [
        "docker",
        "run",
        # this mounts /home/dnanexus into the docker container and
        # sets it as the work dir
        "-v",
        "/home/dnanexus:/home/dnanexus",
        "-w",
        "/home/dnanexus",
        # this calls the container and the command we want
        "quay.io/biocontainers/pbmm2:1.0.0--ha888412_0"
    ]
    pbmm2_cmd = docker_run + ["pbmm2"]
    pbindex_cmd = docker_run + ["pbindex"]
    pbbamify_cmd = docker_run + ["pbbamify"]
    # run man page
    dx_utils.run_cmd(pbmm2_cmd + ['-h'])

    # Iterate over bam files
    output_ofns = []
    for bam in reads:
        prefix = re.sub("(\.subreads)?(\.bam){1}$", "", bam)
        ofn = '{0}.mapped.bam'.format(prefix)
        pb_ofn = '{0}.pb.mapped.bam'.format(prefix)
        s_pb_ofn = '{0}.pb.sorted.bam'.format(prefix)

        if bam + '.pbi' not in pbis:
            print('DNAnexus run pbindex')
            dx_utils.run_cmd(pbindex_cmd + [bam])
        else:
            print('DNAnexus not run pbindex')

        # compute memory per sorting thread
        system_memory = dx_utils.get_memory(suffix='G') - 40
        memory_per_thread = system_memory / SORT_THREADS

        # Call pbmm2 align; build a fresh argv each iteration rather than
        # growing pbmm2_cmd itself across loop passes
        align_cmd = pbmm2_cmd + [
            'align',
            str(ref_genome_mmi),
            str(bam),
            str(ofn), '-j',
            str(MAP_THREADS), '--sort', '-J',
            str(SORT_THREADS), '-m', '{0}G'.format(int(memory_per_thread)),
            '--log-level', 'DEBUG'
        ]
        dx_utils.run_cmd(align_cmd)
        if pbbamify:
            print('DNAnexus call pbbamify')
            bamify_cmd = pbbamify_cmd + [
                '--input=' + ofn, '--output=' + pb_ofn, ref_genome, bam
            ]
            dx_utils.run_cmd(bamify_cmd)
            dx_utils.run_cmd(['rm', ofn])
            # sort
            cmd = [
                'samtools', 'sort', '-o', s_pb_ofn, '-@',
                str(multiprocessing.cpu_count()), '-T', prefix, pb_ofn
            ]
            dx_utils.run_cmd(cmd)
            dx_utils.run_cmd(['rm', pb_ofn])
            # Create index
            cmd = ['samtools', 'index', s_pb_ofn]
            dx_utils.run_cmd(cmd)
            # append to outputs
            output_ofns.append(s_pb_ofn)

        else:
            # Create index
            cmd = ['samtools', 'index', ofn]
            dx_utils.run_cmd(cmd)
            # append to outputs
            output_ofns.append(ofn)

        # check what files are created
        dx_utils.run_cmd(['ls', '.'])

    # both branches above leave the sorted bam paths in output_ofns, each
    # with a matching .bai alongside it, so one return shape covers them
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
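
# Every mapper in this collection returns the same bam/bai upload pair; the
# shape could be shared (sketch):
def _upload_bam_outputs(ofns):
    return {
        'mapped_reads':
        [dxpy.dxlink(dxpy.upload_local_file(f)) for f in ofns],
        'mapped_reads_index':
        [dxpy.dxlink(dxpy.upload_local_file(f + '.bai')) for f in ofns]
    }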
def main(**job_inputs):
    bionano_cmap_link = job_inputs['refinefinal_merged_cmap']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_filename = dx_utils.download_and_gunzip_file(bionano_cmap_link)
    ngs_fasta_filename = dx_utils.download_and_gunzip_file(ngs_fasta_link)
    if args_xml_link:
        args_xml_filename = dx_utils.download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(HYBRID_DIR, 'hybridScaffold_config.xml')

    output_dir = "hybrid_scaffold_output"

    scaffold_cmd = ["perl", os.path.join(HYBRID_DIR, "hybridScaffold.pl"), "-n", ngs_fasta_filename,
                    "-b", bionano_cmap_filename, "-o", output_dir, "-c", args_xml_filename,
                    "-r", os.path.join(TOOLS_DIR, 'RefAligner')]

    if "conflict_resolution_file" in job_inputs:
        conflict_resolution_file = dx_utils.download_and_gunzip_file(job_inputs["conflict_resolution_file"])
        scaffold_cmd += ["-M", conflict_resolution_file]
    else:
        scaffold_cmd += ["-B", str(job_inputs["b_conflict_filter"]), "-N", str(job_inputs["n_conflict_filter"])]

    molecules_bnx_file = None
    if job_inputs["generate_molecules"] is True:
        scaffold_cmd += ["-x", "-p", SCRIPTS_DIR]

        try:
            molecules_bnx_file = dx_utils.download_and_gunzip_file(job_inputs["molecules_bnx_file"])
            scaffold_cmd += ["-m", molecules_bnx_file]

        except KeyError:
            raise dxpy.AppError("Molecules BNX file required for Align Molecules flag (-x)")

        try:
            optargs_xml = dx_utils.download_and_gunzip_file(job_inputs["optargs_xml"])
            scaffold_cmd += ["-q", optargs_xml]

        except KeyError:
            raise dxpy.AppError("OptArgs XML file required for Align Molecules flag (-x)")

    if job_inputs["generate_chimeric"] is True:
        scaffold_cmd += ["-y"]

        if molecules_bnx_file:
            scaffold_cmd += ["-m", molecules_bnx_file]

        else:
            try:
                molecules_bnx_file = dx_utils.download_and_gunzip_file(job_inputs["molecules_bnx_file"])
                scaffold_cmd += ["-m", molecules_bnx_file]

            except KeyError:
                raise dxpy.AppError("Molecules BNX file required for Generate Molecules flag")

        if "err_files" in job_inputs:
            err_files = [dx_utils.download_and_gunzip_file(err_file) for err_file in job_inputs["err_files"]]
            for err in err_files:
                scaffold_cmd += ["-e", err]
    dx_utils.run_cmd(scaffold_cmd)

    dx_utils.run_cmd(["tree", output_dir])

    scaffold_final_ncbi = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_NCBI.fasta'))[0]
    unscaffolded_final = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta'))[0]
    scaffold_final = glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.fasta'))
    scaffold_final.extend(glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.cmap')))
    scaffold_final.extend(glob.glob(
        os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.agp')))

    scaffold_output = glob.glob(os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.xmap'))
    scaffold_output.extend(glob.glob(os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_q.cmap')))
    scaffold_output.extend(glob.glob(os.path.join(output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_r.cmap')))
    scaffold_output = [f for f in scaffold_output if f not in scaffold_final]

    cut_and_conflict = glob.glob(os.path.join(output_dir, 'hybrid_scaffolds*', 'conflicts*.txt'))
    cut_and_conflict.extend(glob.glob(os.path.join(output_dir, 'hybrid_scaffolds*', '*_annotations.bed')))

    # make sure output files don't have colons
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", scaffold_final_ncbi])
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", unscaffolded_final])

    # upload outputs
    output = {"scaffold_final": [dx_utils.gzip_and_upload(f) for f in scaffold_final],
            "scaffold_output": [dx_utils.gzip_and_upload(f) for f in scaffold_output],
            "cut_and_conflict": [dxpy.dxlink(dxpy.upload_local_file(f)) for f in cut_and_conflict],
            "ncbi_scaffold_final": dx_utils.gzip_and_upload(scaffold_final_ncbi),
            "unscaffolded_final": dx_utils.gzip_and_upload(unscaffolded_final)}

    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name,
        outdir=output_dir)
    dx_utils.run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)
    
    output["scaffold_targz"] = dxpy.dxlink(output_id)
    return output
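
# Both hybrid-scaffold entry points in this collection finish by tarring the
# whole work directory; a shared helper for that final step might look like
# this (sketch):
def _tar_and_upload(outdir, tar_name):
    dx_utils.run_cmd('tar czvf {0} {1}'.format(tar_name, outdir))
    return dxpy.dxlink(dxpy.upload_local_file(tar_name))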
def main(**job_inputs):
    bionano_cmap_1_link = job_inputs['bng_enzyme1']
    bionano_cmap_2_link = job_inputs['bng_enzyme2']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_1_filename = os.path.join(
        '/home/dnanexus', dx_utils.download_and_gunzip_file(bionano_cmap_1_link))
    bionano_cmap_2_filename = os.path.join(
        '/home/dnanexus', dx_utils.download_and_gunzip_file(bionano_cmap_2_link))
    ngs_fasta_filename = os.path.join('/home/dnanexus', dx_utils.download_and_gunzip_file(ngs_fasta_link))

    if args_xml_link:
        args_xml_filename = dx_utils.download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(HYBRID_DIR, 'TGH', 'hybridScaffold_two_enzymes.xml')
    output_dir = "hybrid_scaffold_output"

    dx_utils.run_cmd(['mkdir', output_dir])
    results_tar = output_dir + '_results.tar'

    cmd = ["Rscript", os.path.join(HYBRID_DIR, "runTGH.R"), "--help"]
    dx_utils.run_cmd(cmd)

    scaffold_cmd = ["Rscript", os.path.join(HYBRID_DIR, "runTGH.R"), "-N", ngs_fasta_filename,
                    "-b1", bionano_cmap_1_filename, "-b2", bionano_cmap_2_filename, "-O", output_dir,
                    "-R", os.path.join(TOOLS_DIR, 'RefAligner'), "-t", results_tar,
                    "-e1", job_inputs['enzyme1_name'], "-e2", job_inputs['enzyme2_name']]

    if job_inputs.get("cuts1_file") and job_inputs.get("cuts2_file"):
        cuts1_file = dx_utils.download_and_gunzip_file(job_inputs["cuts1_file"])
        cuts2_file = dx_utils.download_and_gunzip_file(job_inputs["cuts2_file"])
        scaffold_cmd += ["-m1", cuts1_file, "-m2", cuts2_file]

    scaffold_cmd += [args_xml_filename]
    dx_utils.run_cmd(scaffold_cmd)

    # try locating the outputs
    final_dirs = ["TGH_M2", "TGH_M1",  "two_enzyme_hybrid_scaffold_M2", "two_enzyme_hybrid_scaffold_M1"]
    for possible_loc in final_dirs:
        scaffold_final = glob.glob(os.path.join(output_dir, possible_loc, 'AGPExport', '*HYBRID_Export.fasta'))

        if scaffold_final:
            scaffold_final_ncbi = glob.glob(
                os.path.join(output_dir, possible_loc, 'AGPExport', '*HYBRID_Export_NCBI.fasta'))[0]
            unscaffolded_final = glob.glob(
                os.path.join(output_dir, possible_loc, 'AGPExport', '*HYBRID_Export_NOT_SCAFFOLDED.fasta'))[0]

            scaffold_output = glob.glob(os.path.join(output_dir, possible_loc, '*_HYBRID_Export.agp'))
            scaffold_output.extend(glob.glob(os.path.join(output_dir, possible_loc, '*_HYBRID_Export.xmap')))
            scaffold_output.extend(glob.glob(os.path.join(output_dir, possible_loc, '*_HYBRID_Export_q.cmap')))
            scaffold_output.extend(glob.glob(os.path.join(output_dir, possible_loc, '*_HYBRID_Export_r.cmap')))
            scaffold_output = [f for f in scaffold_output if f not in scaffold_final]
            break

    # if still not found, something went wrong
    if not scaffold_final:
        hybrid_scaffold_log = os.path.join(output_dir, 'TGH.log')
        dx_utils.run_cmd(["tail", "-n", "50", hybrid_scaffold_log])
        raise dxpy.AppError("ERROR: No hybrid scaffolds produced.")

    # make sure output files don't have colons
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", scaffold_final_ncbi])
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", unscaffolded_final])

    output = {
        "scaffold_fasta": [dxpy.dxlink(dxpy.upload_local_file(f)) for f in scaffold_final if f.endswith(".fasta")],
        "scaffold_output": [dxpy.dxlink(dxpy.upload_local_file(f)) for f in scaffold_output],
        "ncbi_scaffold_final": dx_utils.gzip_and_upload(scaffold_final_ncbi),
        "unscaffolded_final": dx_utils.gzip_and_upload(unscaffolded_final)
        }
    
    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name,
        outdir=output_dir)
    dx_utils.run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)

    output["scaffold_targz"] = dxpy.dxlink(output_id)

    return output