# Imports shared by the applet entry points below. dx_utils is the
# applet-local helper module shipped alongside each applet; VCF_FOFN,
# MAP_THREADS, SORT_THREADS, HYBRID_DIR, TOOLS_DIR and SCRIPTS_DIR are
# module-level constants defined in the respective applet sources.
import glob
import multiprocessing
import os
import re
import subprocess
import tempfile

import dxpy

import dx_utils


def main(**job_inputs):
    input_vcfs = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in job_inputs['input_vcfs']
    ]
    input_ref = dx_utils.download_and_gunzip_file(job_inputs['ref_fasta'])

    # create index files for inputs
    dx_utils.run_cmd(['samtools', 'faidx', input_ref])
    for vcf in input_vcfs:
        dx_utils.run_cmd('tabix {0}'.format(vcf))

    with open(VCF_FOFN, 'w') as fh:
        fh.write('\n'.join(input_vcfs))

    # get the bcftools version and help doc
    dx_utils.run_cmd(['bcftools', '--help'])

    output_prefix = job_inputs.get('output_prefix', '')
    output_bcf = output_prefix + 'concat.bcf'

    # concatenate the bcf/vcf files, drop reference-only records, and
    # left-normalize
    concat_cmd = ['bcftools', 'concat', '-f', VCF_FOFN]
    # the expression must be its own argv element so bcftools sees
    # -e type="ref" rather than a single quoted token
    view_cmd = ['bcftools', 'view', '-Ou', '-e', 'type="ref"']
    norm_cmd = [
        'bcftools', 'norm', '-Ob', '-f', input_ref, '-o', output_bcf,
        '--threads={0}'.format(multiprocessing.cpu_count())
    ]
    # run the commands
    dx_utils.run_pipe(concat_cmd, view_cmd, norm_cmd)

    # index the concatenated bcf file
    dx_utils.run_cmd(['bcftools', 'index', output_bcf])

    # call consensus
    output_fasta = output_prefix + 'consensus.fasta'
    consensus_filter = 'QUAL>1 && (GT="AA" || GT="Aa")'
    consensus_cmd = [
        'bcftools', 'consensus', '-i', consensus_filter, '-Hla',
        '-f', input_ref, output_bcf
    ]
    dx_utils.run_pipe(consensus_cmd, outputFile=output_fasta)

    # save the changes to vcf
    output_vcf = output_prefix + 'changes.vcf.gz'
    vcf_cmd = [
        'bcftools', 'view', '-i', consensus_filter, '-Oz',
        '--threads={0}'.format(multiprocessing.cpu_count()), output_bcf
    ]
    dx_utils.run_pipe(vcf_cmd, outputFile=output_vcf)

    consensus_link = dx_utils.gzip_and_upload(output_fasta)
    print(consensus_link)

    output = {}
    output['consensus_fasta'] = consensus_link
    output['consensus_vcf'] = dxpy.dxlink(dxpy.upload_local_file(output_vcf))
    return output
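# The entry points in this file lean on dx_utils.run_pipe and
# dx_utils.run_cmd, whose real implementations live in the applet-local
# helper module. As a rough illustration only (the name run_pipe_sketch and
# this signature are inferred from the call sites, not taken from the real
# module), run_pipe chains argv lists stdout-to-stdin with OS pipes and can
# redirect the final command's stdout into a file:
def run_pipe_sketch(*cmds, **kwargs):
    """Chain commands stdout->stdin; optionally write the last stdout to
    kwargs['outputFile']."""
    output_file = kwargs.get('outputFile')
    out_fh = open(output_file, 'w') if output_file else None
    last = len(cmds) - 1
    procs = []
    prev_stdout = None
    for i, cmd in enumerate(cmds):
        # intermediate commands write to a pipe; the last one writes to the
        # output file, or inherits the parent's stdout when out_fh is None
        stdout = out_fh if i == last else subprocess.PIPE
        procs.append(subprocess.Popen(cmd, stdin=prev_stdout, stdout=stdout))
        prev_stdout = procs[-1].stdout
    for proc in procs:
        if proc.wait() != 0:
            raise RuntimeError('pipeline step failed')
    if out_fh:
        out_fh.close()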
def _run_meryl(output, sequences, k_mer_size, is10x):
    dx_utils.run_cmd(["mkdir", "unuse"])
    dx_utils.run_cmd(["mv", "/usr/src/canu", "/home/dnanexus/unuse"])
    dx_utils.run_cmd(["chmod", "777", "meryl"])
    dx_utils.run_cmd(["./meryl", "--version"])
    #dx_utils.run_cmd(["./meryl"])

    # give meryl roughly 60% of system memory
    mem_in_gb = dx_utils.run_cmd(
        "head -n1 /proc/meminfo | awk '{print int($2*0.6/1024/1024)}'",
        returnOutput=True).strip()
    print('mem in GB', mem_in_gb)
    # note: ulimit only applies to the subshell it runs in
    dx_utils.run_cmd('ulimit -Sn 32000')

    meryl_kmer = [
        "./meryl",
        "threads={}".format(multiprocessing.cpu_count()),
        "k={}".format(k_mer_size),
        "memory={}".format(mem_in_gb)
    ]

    # split each input into 300M-line chunks under split/
    dx_utils.run_cmd(["mkdir", "-p", "split"])
    for file_ref in sequences:
        dx_utils.download_and_gunzip_file(file_ref)
        each_file_name = dxpy.describe(file_ref)["name"].replace(".gz", '')
        print('processing {0}'.format(each_file_name))
        each_file_prefix = each_file_name.replace('fastq', '')
        each_file_prefix = each_file_prefix.replace('fq', '')
        if 'R1' in each_file_name and is10x:
            # keep odd lines (headers, '+' separators) intact and trim the
            # first 23 characters (10x barcode region) from even lines
            print('10x trimming')
            dx_utils.run_cmd(
                "cat " + each_file_name +
                " | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}'"
                " | split -a 4 -d -l 300000000 --additional-suffix='.fq'"
                " - split/" + each_file_prefix)
        else:
            print('no trimming')
            dx_utils.run_cmd(
                "cat {0} | split -a 4 -d -l 300000000 "
                "--additional-suffix='.fq' - split/{1}".format(
                    each_file_name, each_file_prefix))
        #meryl_kmer.extend([dxpy.describe(file_ref)["name"].replace(".gz",'')])

    # count k-mers per chunk
    for split_file in os.listdir('/home/dnanexus/split'):
        split_file_prefix = split_file.replace('.fq', '')
        command = meryl_kmer + [
            "count", "split/" + split_file,
            "output", "split/" + split_file_prefix + '.meryl'
        ]
        dx_utils.run_cmd(command)

    # merge the per-chunk databases with union-sum
    folder_string = subprocess.check_output('ls -d split/*/', shell=True)
    folder_list = [f[:-1] for f in folder_string.decode().strip().split('\n')]
    meryl_union_sum = [
        "./meryl",
        "threads={}".format(multiprocessing.cpu_count()),
        "k={}".format(k_mer_size),
        "memory={}".format(mem_in_gb),
        "union-sum", "output", "output"
    ]
    meryl_union_sum.extend(folder_list)
    dx_utils.run_cmd(meryl_union_sum)

    with open("mer_counts.tsv", 'w') as kmers_index:
        subprocess.check_call(["./meryl", "histogram", "output"],
                              stdout=kmers_index)
    output["histogram"] = dxpy.dxlink(dxpy.upload_local_file("mer_counts.tsv"))
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Get help info
    dx_utils.run_cmd(['minimap2', '-h'])

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)

        # Call minimap2, convert to uncompressed BAM, and sort
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = [
            'sambamba', 'sort',
            '-m', '{0}G'.format(int(dx_utils.get_memory(suffix='G'))),
            '-o', ofn,
            '-t', str(multiprocessing.cpu_count()),
            '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(['sambamba', 'index', ofn])

        # append to outputs
        output_ofns.append(ofn)

    return {
        'mapped_reads':
            [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
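# dx_utils.get_memory is used above (and in the pbmm2 functions below) to
# size sort buffers. The real implementation is in the helper module; a
# plausible minimal sketch, assuming it reads MemTotal from /proc/meminfo
# and converts kB to the requested unit (name and behavior are assumptions):
def get_memory_sketch(suffix='G'):
    """Return total system memory in gigabytes ('G') or megabytes ('M')."""
    divisor = {'G': 1024 * 1024, 'M': 1024}[suffix]  # /proc/meminfo is in kB
    with open('/proc/meminfo') as fh:
        for line in fh:
            if line.startswith('MemTotal:'):
                return float(line.split()[1]) / divisor
    raise RuntimeError('MemTotal not found in /proc/meminfo')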
def run_minimap_index(genome_fastagz):
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ofn = os.path.splitext(ref_genome)[0] + '.mmi'
    minimap2_indx_cmd = ["minimap2", "-d", ofn, ref_genome]
    dx_utils.run_cmd(minimap2_indx_cmd)
    return {'genome_mmi': dxpy.dxlink(dxpy.upload_local_file(ofn))}
def run_bwa(reads, ref_genome):
    dx_utils.download_and_gunzip_file(ref_genome)
    # the BWA index prefix is whatever .bwt file came with the reference
    bwt_filename = dx_utils.run_cmd('ls *.bwt', returnOutput=True).strip()
    genome_prefix = os.path.splitext(bwt_filename)[0]
    reads = dx_utils.download_and_gunzip_file(
        reads, skip_decompress=True, create_named_pipe=True)
    ofn = re.sub(r'\.fastq(\.gz)?$', '.bam', reads)
    # map with bwa mem, keep only 5'-most alignments, and convert to BAM
    cmd = 'set -e -o pipefail; bwa mem -t {0} -B 8 {1} {2} '.format(
        multiprocessing.cpu_count(), genome_prefix, reads)
    cmd += '| perl ./filter_five_end.pl | samtools view -@{0} -Sb - > {1} '.format(
        multiprocessing.cpu_count(), ofn)
    dx_utils.run_cmd(cmd)
    return {'output_bam': dxpy.dxlink(dxpy.upload_local_file(ofn))}
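# run_bwa above asks download_and_gunzip_file for a named pipe so bwa can
# stream the reads without the compressed file landing on disk first. The
# real helper lives in dx_utils; this is only a sketch of the idea (the
# function name and return convention are hypothetical), streaming `dx cat`
# into a FIFO:
def download_to_named_pipe_sketch(dxlink, pipe_name):
    """Start `dx cat` writing into a FIFO and return the pipe path."""
    file_id = dxlink['$dnanexus_link']
    os.mkfifo(pipe_name)
    # the shell redirection blocks in the child until a consumer (e.g.
    # bwa mem) opens the FIFO for reading
    writer = subprocess.Popen(
        'dx cat {0} > {1}'.format(file_id, pipe_name), shell=True)
    return pipe_name, writer  # caller should writer.wait() after consuming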
def combine_bams(fwd_bam, rev_bam):
    fwd_bam = dx_utils.download_and_gunzip_file(fwd_bam, create_named_pipe=True)
    rev_bam = dx_utils.download_and_gunzip_file(rev_bam, create_named_pipe=True)
    ofn = re.sub(r'\.bam$', '.combined.bam', fwd_bam)
    #cmd = 'perl two_read_bam_combiner.pl {0} {1} | samtools view -@{2} -Sb - | samtools sort -o {3} - '
    # bamsormadup sorts, marks duplicates, and writes the index in one pass
    cmd = ('set -e -o pipefail; perl two_read_bam_combiner.pl {0} {1} '
           '| /opt/biobambam2/bin/bamsormadup inputformat=sam '
           'indexfilename={2}.bai > {2} ')
    cmd = cmd.format(fwd_bam, rev_bam, ofn)
    dx_utils.run_cmd(cmd)
    return {
        'output_bam':
            dxpy.dxlink(dxpy.upload_local_file(ofn, name=fwd_bam)),
        'output_bai':
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai',
                                               name=fwd_bam + '.bai'))
    }
def run_minimap_index(genome_fastagz):
    # load the docker images
    dx_utils.run_cmd(['docker', 'load', '-i', '/opt/minimap2_images.tar'])

    # download the reference genome
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ofn = os.path.splitext(ref_genome)[0] + '.mmi'

    # run minimap2 index inside the container, with the worker home
    # directory mounted as the work dir
    docker_cmd = [
        'docker', 'run',
        '-v', '/home/dnanexus:/home/dnanexus',
        '-w', '/home/dnanexus'
    ]
    minimap2_indx_cmd = docker_cmd + [
        'quay.io/biocontainers/minimap2:2.17--h84994c4_0'
    ]
    minimap2_indx_cmd += ["minimap2", "-d", ofn, ref_genome]
    dx_utils.run_cmd(minimap2_indx_cmd)
    return {'genome_mmi': dxpy.dxlink(dxpy.upload_local_file(ofn))}
def main(input_assembly, hic_alignments, restriction_enzyme_bases,
         filter_alignments, input_assembly_graph=None):
    # make sure we can run salsa
    dx_utils.run_cmd("python /opt/SALSA/run_pipeline.py -h")

    input_assembly = dx_utils.download_and_gunzip_file(input_assembly)
    alignment_prefix = input_assembly.split(".fasta")[0]

    # process inputs and convert to bed
    first_file = True
    for bam_file in hic_alignments:
        fn = dxpy.describe(bam_file['$dnanexus_link'])['name']
        cmd = 'dx cat {0}'.format(bam_file['$dnanexus_link'])
        prefix, suffix = os.path.splitext(fn)
        if suffix == '.gz':
            cmd += '| gunzip '
            fn = prefix
        cmd += '| bedtools bamtobed -i stdin'
        if first_file:
            cmd += ' > {0}.bed'.format(alignment_prefix)
            first_file = False
        else:
            cmd += ' >> {0}.bed'.format(alignment_prefix)
        dx_utils.run_cmd(cmd)

    # index the ref
    dx_utils.run_cmd('samtools faidx {0} '.format(input_assembly))

    # if we were asked to filter by contig names, make a bed file and
    # subset the input bed
    if filter_alignments:
        with open('%s.fai' % input_assembly) as f, \
                open('%s.contigs.bed' % alignment_prefix, 'w') as o:
            for line in f:
                line = line.strip().split()
                o.write("%s\t1\t%s\n" % (line[0], line[1]))
        cmd = ('bedtools intersect -wa -a {0}.bed -b {0}.contigs.bed '
               '> {0}.filtered.bed').format(alignment_prefix)
        dx_utils.run_cmd(cmd)
    else:
        dx_utils.run_cmd(
            'ln -s {0}.bed {0}.filtered.bed'.format(alignment_prefix))

    # now sort the bed file by read name
    cmd = "sort -T . -k4 {0}.filtered.bed > {0}.sorted.bed".format(
        alignment_prefix)
    dx_utils.run_cmd(cmd)

    cmd = ('python /opt/SALSA/run_pipeline.py -a {0} -b {1}.sorted.bed '
           '-l {0}.fai -o {2} -e {3} -m yes -p yes ')
    cmd = cmd.format(input_assembly, alignment_prefix, './',
                     ','.join(restriction_enzyme_bases))
    if input_assembly_graph is not None:
        # download the assembly graph and pass its local path to SALSA
        graph_filename = dx_utils.download_and_gunzip_file(input_assembly_graph)
        cmd = "%s -g %s" % (cmd, graph_filename)
    dx_utils.run_cmd(cmd)

    output = {}
    # final scaffold
    final_fasta = glob.glob('scaffold*FINAL.fasta')[0]
    output['final_scaffold_fasta'] = dx_utils.gzip_and_upload(final_fasta)
    # final agp
    final_agp = glob.glob('scaffold*FINAL.agp')[0]
    output['final_scaffold_agp'] = dx_utils.gzip_and_upload(final_agp)
    # alignment_iteration_1 bed
    alignment_iteration_1 = glob.glob('alignment_iteration_1.bed')[0]
    output['alignment_iteration_1'] = dx_utils.gzip_and_upload(
        alignment_iteration_1)
    # scaffold_length_iteration_1
    scaffold_length_iteration_1 = glob.glob('scaffold_length_iteration_1')[0]
    output['scaffold_length_iteration_1'] = dx_utils.gzip_and_upload(
        scaffold_length_iteration_1)
    # all others
    files = glob.glob('scaffold*fasta')
    files.extend(glob.glob('scaffold*agp'))
    if final_fasta in files:
        files.remove(final_fasta)
    if final_agp in files:
        files.remove(final_agp)
    print(files)
    output['scaffold'] = dx_utils.tar_files_and_upload(files, alignment_prefix)
    return output
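# Several entry points here return dx_utils.gzip_and_upload links for their
# FASTA/AGP outputs. The real helper is part of the applet's dx_utils
# module; a minimal sketch of the assumed behavior (compress in place,
# upload, return a DNAnexus link) for orientation only:
def gzip_and_upload_sketch(local_path):
    """Gzip a local file and upload it, returning a dxlink."""
    subprocess.check_call(['gzip', '-f', local_path])
    return dxpy.dxlink(dxpy.upload_local_file(local_path + '.gz'))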
def map_reads_pbmm2(bam_files, pbi_files, genome_fastagz, genome_mmi):
    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f) for f in bam_files]
    pbis = [dx_utils.download_and_gunzip_file(f) for f in pbi_files]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # find out what environment we're dealing with
    print(os.environ)
    # use environment to set path
    pbmm2_env = {
        "PATH": os.environ['PATH'] + os.pathsep + '/anaconda/bin/',
        "SHELL": '/bin/bash',
        "USER": '******'
    }

    # create bam dataset
    with dx_utils.set_env(**pbmm2_env):
        # Iterate over bam files
        output_ofns = []
        for bam in reads:
            prefix = re.sub(r"(\.subreads)?\.bam$", "", bam)
            ofn = '{0}.mapped.bam'.format(prefix)
            if bam + '.pbi' not in pbis:
                dx_utils.run_cmd('pbindex {0}'.format(bam))
            dx_utils.run_cmd(['pbmm2', 'align', '--help'])

            # compute memory per sorting thread
            system_memory = dx_utils.get_memory(suffix='G') - 40
            memory_per_thread = system_memory / SORT_THREADS
            pbmm2_cmd = [
                '/anaconda/bin/pbmm2', 'align',
                str(ref_genome_mmi), str(bam), str(ofn),
                '-j', str(MAP_THREADS),
                '--sort', '-J', str(SORT_THREADS),
                '-m', '{0}G'.format(int(memory_per_thread)),
                '--log-level', 'DEBUG'
            ]
            # there's some env variable that is causing pbmm2 to misbehave
            # when run from a python shell. Therefore, write the command to
            # a temp .sh file and execute it that way.
            tmp_cmd = tempfile.NamedTemporaryFile(mode='w', suffix='.sh',
                                                  delete=False)
            tmp_cmd.write(' '.join(pbmm2_cmd))
            tmp_cmd.close()
            print('Executing: {0}'.format(' '.join(pbmm2_cmd)))
            dx_utils.run_cmd(['sudo', 'bash', tmp_cmd.name])

            # Create index
            dx_utils.run_cmd(['samtools', 'index', ofn])
            # append to outputs
            output_ofns.append(ofn)

    return {
        'mapped_reads':
            [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
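# dx_utils.set_env above is used as a context manager that applies the given
# environment overrides for the duration of the block. A minimal sketch of
# that pattern (the real helper may differ), restoring the previous values
# on exit:
import contextlib


@contextlib.contextmanager
def set_env_sketch(**overrides):
    """Temporarily apply environment variable overrides."""
    saved = {k: os.environ.get(k) for k in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for key, value in saved.items():
            if value is None:
                del os.environ[key]  # variable did not exist before
            else:
                os.environ[key] = value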
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # load the docker images
    dx_utils.run_cmd(['docker', 'load', '-i', '/opt/minimap2_images.tar'])

    # Download inputs
    reads = [
        dx_utils.download_and_gunzip_file(f, skip_decompress=True)
        for f in reads
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    elif datatype == 'CCS':
        preset_param = 'asm20'
    else:
        preset_param = 'map-ont'

    docker_cmd = [
        'docker', 'run',
        '-v', '/home/dnanexus:/home/dnanexus',
        '-w', '/home/dnanexus'
    ]
    minimap2_docker_cmd = docker_cmd + [
        'quay.io/biocontainers/minimap2:2.17--h84994c4_0'
    ]
    # Run sambamba with -i flag, meaning it can read from stdin
    sambamba_docker_cmd = docker_cmd + [
        '-i', 'quay.io/biocontainers/sambamba:0.6.8--h682856c_1'
    ]

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)

        # Call minimap2 against the prebuilt index
        minimap2_cmd = minimap2_docker_cmd + [
            'minimap2', '-ax', preset_param, ref_genome_mmi, read
        ]
        view_cmd = sambamba_docker_cmd + [
            'sambamba', 'view', '--sam-input', '--format=bam',
            '--compression-level=0', '/dev/stdin'
        ]
        sort_cmd = sambamba_docker_cmd + [
            'sambamba', 'sort',
            '-m', '{0}G'.format(int(dx_utils.get_memory(suffix='G'))),
            '-o', ofn,
            '-t', str(multiprocessing.cpu_count()),
            '/dev/stdin'
        ]
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)

        # index
        dx_utils.run_cmd(sambamba_docker_cmd + ['sambamba', 'index', ofn])

        # append to outputs
        output_ofns.append(ofn)

    return {
        'mapped_reads':
            [dxpy.dxlink(dxpy.upload_local_file(ofn)) for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns
        ]
    }
def map_reads_pbmm2(bam_files, pbi_files, genome_fastagz, genome_mmi, pbbamify):
    # Unpack the docker image
    dx_utils.run_cmd("docker load -i /opt/pbmm2/pbmm2_docker.tar.gz")

    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f) for f in bam_files]
    pbis = [
        dx_utils.download_and_gunzip_file(f) for f in pbi_files
        if f is not None
    ]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # make sure we have full paths of inputs
    reads = [os.path.join('/home/dnanexus', r) for r in reads]
    pbis = [os.path.join('/home/dnanexus', r) for r in pbis]
    ref_genome = os.path.join('/home/dnanexus', ref_genome)
    ref_genome_mmi = os.path.join('/home/dnanexus', ref_genome_mmi)

    docker_run = [
        "docker", "run",
        # this mounts /home/dnanexus into the docker container and
        # sets it as the work dir
        "-v", "/home/dnanexus:/home/dnanexus",
        "-w", "/home/dnanexus",
        # this calls the container and the command we want
        "quay.io/biocontainers/pbmm2:1.0.0--ha888412_0"
    ]
    pbmm2_cmd = docker_run + ["pbmm2"]
    pbindex_cmd = docker_run + ["pbindex"]
    pbbamify_cmd = docker_run + ["pbbamify"]

    # run man page
    dx_utils.run_cmd(pbmm2_cmd + ['-h'])

    # Iterate over bam files
    output_ofns = []
    for bam in reads:
        prefix = re.sub(r"(\.subreads)?\.bam$", "", bam)
        ofn = '{0}.mapped.bam'.format(prefix)
        pb_ofn = '{0}.pb.mapped.bam'.format(prefix)
        s_pb_ofn = '{0}.pb.sorted.bam'.format(prefix)

        if bam + '.pbi' not in pbis:
            print('DNAnexus run pbindex')
            dx_utils.run_cmd(pbindex_cmd + [bam])
        else:
            print('DNAnexus not run pbindex')

        # compute memory per sorting thread
        system_memory = dx_utils.get_memory(suffix='G') - 40
        memory_per_thread = system_memory / SORT_THREADS

        # Call pbmm2 align; build a fresh command list each iteration rather
        # than growing pbmm2_cmd itself across bam files
        align_cmd = pbmm2_cmd + [
            'align', str(ref_genome_mmi), str(bam), str(ofn),
            '-j', str(MAP_THREADS),
            '--sort', '-J', str(SORT_THREADS),
            '-m', '{0}G'.format(int(memory_per_thread)),
            '--log-level', 'DEBUG'
        ]
        dx_utils.run_cmd(align_cmd)

        if pbbamify:
            print('DNAnexus call pbbamify')
            bamify_cmd = pbbamify_cmd + [
                '--input=' + ofn, '--output=' + pb_ofn, ref_genome, bam
            ]
            dx_utils.run_cmd(bamify_cmd)
            dx_utils.run_cmd(['rm', ofn])
            # sort
            cmd = [
                'samtools', 'sort', '-o', s_pb_ofn,
                '-@', str(multiprocessing.cpu_count()),
                '-T', prefix, pb_ofn
            ]
            dx_utils.run_cmd(cmd)
            dx_utils.run_cmd(['rm', pb_ofn])
            # Create index
            dx_utils.run_cmd(['samtools', 'index', s_pb_ofn])
            # append to outputs
            output_ofns.append(s_pb_ofn)
        else:
            # Create index
            dx_utils.run_cmd(['samtools', 'index', ofn])
            # append to outputs
            output_ofns.append(ofn)

    # check what files are created
    dx_utils.run_cmd(['ls', '.'])

    # both the pbbamify and plain paths leave their final BAM names in
    # output_ofns, so a single return covers both cases
    return {
        'mapped_reads':
            [dxpy.dxlink(dxpy.upload_local_file(f)) for f in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(f + '.bai'))
            for f in output_ofns
        ]
    }
def main(**job_inputs):
    bionano_cmap_link = job_inputs['refinefinal_merged_cmap']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_filename = dx_utils.download_and_gunzip_file(bionano_cmap_link)
    ngs_fasta_filename = dx_utils.download_and_gunzip_file(ngs_fasta_link)
    if args_xml_link:
        args_xml_filename = dx_utils.download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(HYBRID_DIR, 'hybridScaffold_config.xml')

    output_dir = "hybrid_scaffold_output"
    scaffold_cmd = [
        "perl", os.path.join(HYBRID_DIR, "hybridScaffold.pl"),
        "-n", ngs_fasta_filename,
        "-b", bionano_cmap_filename,
        "-o", output_dir,
        "-c", args_xml_filename,
        "-r", os.path.join(TOOLS_DIR, 'RefAligner')
    ]
    if "conflict_resolution_file" in job_inputs:
        conflict_resolution_file = dx_utils.download_and_gunzip_file(
            job_inputs["conflict_resolution_file"])
        scaffold_cmd += ["-M", conflict_resolution_file]
    else:
        scaffold_cmd += ["-B", str(job_inputs["b_conflict_filter"]),
                         "-N", str(job_inputs["n_conflict_filter"])]

    molecules_bnx_file = None
    if job_inputs["generate_molecules"] is True:
        scaffold_cmd += ["-x", "-p", SCRIPTS_DIR]
        try:
            molecules_bnx_file = dx_utils.download_and_gunzip_file(
                job_inputs["molecules_bnx_file"])
            scaffold_cmd += ["-m", molecules_bnx_file]
        except KeyError:
            raise dxpy.AppError(
                "Molecules BNX file required for Align Molecules flag (-x)")
        try:
            optargs_xml = dx_utils.download_and_gunzip_file(
                job_inputs["optargs_xml"])
            scaffold_cmd += ["-q", optargs_xml]
        except KeyError:
            raise dxpy.AppError(
                "OptArgs XML file required for Align Molecules flag (-x)")

    if job_inputs["generate_chimeric"] is True:
        scaffold_cmd += ["-y"]
        if molecules_bnx_file:
            scaffold_cmd += ["-m", molecules_bnx_file]
        else:
            try:
                molecules_bnx_file = dx_utils.download_and_gunzip_file(
                    job_inputs["molecules_bnx_file"])
                scaffold_cmd += ["-m", molecules_bnx_file]
            except KeyError:
                raise dxpy.AppError(
                    "Molecules BNX file required for Generate Molecules flag")

    if "err_files" in job_inputs:
        err_files = [dx_utils.download_and_gunzip_file(err_file)
                     for err_file in job_inputs["err_files"]]
        for err in err_files:
            scaffold_cmd += ["-e", err]

    dx_utils.run_cmd(scaffold_cmd)
    dx_utils.run_cmd(["tree", output_dir])

    # collect outputs
    scaffold_final_ncbi = glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_NCBI.fasta'))[0]
    unscaffolded_final = glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds',
        '*_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta'))[0]
    scaffold_final = glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.fasta'))
    scaffold_final.extend(glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.cmap')))
    scaffold_final.extend(glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.agp')))
    scaffold_output = glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD.xmap'))
    scaffold_output.extend(glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_q.cmap')))
    scaffold_output.extend(glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds', '*_HYBRID_SCAFFOLD_r.cmap')))
    scaffold_output = [f for f in scaffold_output if f not in scaffold_final]
    cut_and_conflict = glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds*', 'conflicts*.txt'))
    cut_and_conflict.extend(glob.glob(os.path.join(
        output_dir, 'hybrid_scaffolds*', '*_annotations.bed')))

    # make sure output files don't have colons
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", scaffold_final_ncbi])
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", unscaffolded_final])

    # upload outputs
    output = {
        "scaffold_final": [dx_utils.gzip_and_upload(f) for f in scaffold_final],
        "scaffold_output": [dx_utils.gzip_and_upload(f)
                            for f in scaffold_output],
        "cut_and_conflict": [dxpy.dxlink(dxpy.upload_local_file(f))
                             for f in cut_and_conflict],
        "ncbi_scaffold_final": dx_utils.gzip_and_upload(scaffold_final_ncbi),
        "unscaffolded_final": dx_utils.gzip_and_upload(unscaffolded_final)
    }
    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name, outdir=output_dir)
    dx_utils.run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)
    output["scaffold_targz"] = dxpy.dxlink(output_id)
    return output
def main(**job_inputs):
    bionano_cmap_1_link = job_inputs['bng_enzyme1']
    bionano_cmap_2_link = job_inputs['bng_enzyme2']
    ngs_fasta_link = job_inputs['ngs_fasta_or_cmap']
    args_xml_link = job_inputs.get('args_xml')

    # Download all the inputs
    bionano_cmap_1_filename = os.path.join(
        '/home/dnanexus',
        dx_utils.download_and_gunzip_file(bionano_cmap_1_link))
    bionano_cmap_2_filename = os.path.join(
        '/home/dnanexus',
        dx_utils.download_and_gunzip_file(bionano_cmap_2_link))
    ngs_fasta_filename = os.path.join(
        '/home/dnanexus',
        dx_utils.download_and_gunzip_file(ngs_fasta_link))
    if args_xml_link:
        args_xml_filename = dx_utils.download_and_gunzip_file(args_xml_link)
    else:
        args_xml_filename = os.path.join(
            HYBRID_DIR, 'TGH', 'hybridScaffold_two_enzymes.xml')

    output_dir = "hybrid_scaffold_output"
    dx_utils.run_cmd(['mkdir', output_dir])
    results_tar = output_dir + '_results.tar'

    dx_utils.run_cmd(["Rscript", os.path.join(HYBRID_DIR, "runTGH.R"),
                      "--help"])

    scaffold_cmd = [
        "Rscript", os.path.join(HYBRID_DIR, "runTGH.R"),
        "-N", ngs_fasta_filename,
        "-b1", bionano_cmap_1_filename,
        "-b2", bionano_cmap_2_filename,
        "-O", output_dir,
        "-R", os.path.join(TOOLS_DIR, 'RefAligner'),
        "-t", results_tar,
        "-e1", job_inputs['enzyme1_name'],
        "-e2", job_inputs['enzyme2_name']
    ]
    if job_inputs.get("cuts1_file") and job_inputs.get("cuts2_file"):
        cuts1_file = dx_utils.download_and_gunzip_file(job_inputs["cuts1_file"])
        cuts2_file = dx_utils.download_and_gunzip_file(job_inputs["cuts2_file"])
        scaffold_cmd += ["-m1", cuts1_file, "-m2", cuts2_file]
    scaffold_cmd += [args_xml_filename]
    dx_utils.run_cmd(scaffold_cmd)

    # try locating the outputs
    final_dirs = ["TGH_M2", "TGH_M1",
                  "two_enzyme_hybrid_scaffold_M2",
                  "two_enzyme_hybrid_scaffold_M1"]
    for possible_loc in final_dirs:
        scaffold_final = glob.glob(os.path.join(
            output_dir, possible_loc, 'AGPExport', '*HYBRID_Export.fasta'))
        if scaffold_final:
            scaffold_final_ncbi = glob.glob(os.path.join(
                output_dir, possible_loc, 'AGPExport',
                '*HYBRID_Export_NCBI.fasta'))[0]
            unscaffolded_final = glob.glob(os.path.join(
                output_dir, possible_loc, 'AGPExport',
                '*HYBRID_Export_NOT_SCAFFOLDED.fasta'))[0]
            scaffold_output = glob.glob(os.path.join(
                output_dir, possible_loc, '*_HYBRID_Export.agp'))
            scaffold_output.extend(glob.glob(os.path.join(
                output_dir, possible_loc, '*_HYBRID_Export.xmap')))
            scaffold_output.extend(glob.glob(os.path.join(
                output_dir, possible_loc, '*_HYBRID_Export_q.cmap')))
            scaffold_output.extend(glob.glob(os.path.join(
                output_dir, possible_loc, '*_HYBRID_Export_r.cmap')))
            scaffold_output = [f for f in scaffold_output
                               if f not in scaffold_final]
            break

    # if still not found, something went wrong
    if not scaffold_final:
        hybrid_scaffold_log = os.path.join(output_dir, 'TGH.log')
        dx_utils.run_cmd(["tail", "-n", "50", hybrid_scaffold_log])
        raise dxpy.AppError("ERROR: No hybrid scaffolds produced.")

    # make sure output files don't have colons
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", scaffold_final_ncbi])
    dx_utils.run_cmd(["sed", "-i.bak", "s/:/_/g", unscaffolded_final])

    output = {
        "scaffold_fasta": [dxpy.dxlink(dxpy.upload_local_file(f))
                           for f in scaffold_final if f.endswith(".fasta")],
        "scaffold_output": [dxpy.dxlink(dxpy.upload_local_file(f))
                            for f in scaffold_output],
        "ncbi_scaffold_final": dx_utils.gzip_and_upload(scaffold_final_ncbi),
        "unscaffolded_final": dx_utils.gzip_and_upload(unscaffolded_final)
    }
    tar_name = "hybrid_scaffold_output.tar.gz"
    tar_cmd = "tar czvf {tar_name} {outdir}".format(
        tar_name=tar_name, outdir=output_dir)
    dx_utils.run_cmd(tar_cmd)
    output_id = dxpy.upload_local_file(tar_name)
    output["scaffold_targz"] = dxpy.dxlink(output_id)
    return output