def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # Count the duplicate reads in each BAM file in parallel.
    jobs = []
    for bam_filename in bam_filenames:
        x = count_duplicates, (bam_filename,), {}
        jobs.append(x)
    results = parallel.pyfun(jobs, num_procs=num_cores)
    metadata["num_cores"] = num_cores
    assert len(results) == len(bam_filenames)

    # Write a summary table with the duplication rate for each sample.
    handle = open(outfile, 'w')
    header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
    print >>handle, "\t".join(header)
    for i in range(len(bam_filenames)):
        x, sample, x = mlib.splitpath(bam_filenames[i])
        total_reads, dup_reads = results[i]
        perc_dup = float(dup_reads) / total_reads * 100
        perc_dup = "%.2f" % perc_dup
        x = sample, dup_reads, total_reads, perc_dup
        print >>handle, "\t".join(map(str, x))
    handle.close()

    return metadata
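
# count_duplicates is referenced above but defined elsewhere in this module.
# The sketch below is hypothetical, not the original implementation: it
# assumes the counts can be parsed from "samtools flagstat" output and that
# the function returns (total_reads, dup_reads), as unpacked above.
def _count_duplicates_sketch(bam_filename):
    import subprocess
    output = subprocess.check_output(["samtools", "flagstat", bam_filename])
    total_reads = dup_reads = None
    for line in output.splitlines():
        # flagstat lines look like:
        #   431 + 0 in total (QC-passed reads + QC-failed reads)
        #   13 + 0 duplicates
        if "in total" in line:
            total_reads = int(line.split()[0])
        elif "duplicates" in line and dup_reads is None:
            dup_reads = int(line.split()[0])
    assert total_reads is not None and dup_reads is not None
    return total_reads, dup_reads
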

def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bam_path = in_data.identifier
    assert os.path.exists(bam_path)
    assert os.path.isdir(bam_path)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # Find all the BAM files.
    bam_filenames = filelib.list_files_in_path(
        bam_path, endswith=".bam", case_insensitive=True)

    jobs = []  # list of in_filename, out_filename
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        assert not os.path.exists(out_filename)
        x = in_filename, out_filename
        jobs.append(x)

    # Symlink the BAM files to the output path.
    for x in jobs:
        in_filename, out_filename = x
        os.symlink(in_filename, out_filename)

    # Index each of the files.
    sq = parallel.quote
    samtools = filelib.which_assert(config.samtools)
    commands = []
    for x in jobs:
        in_filename, out_filename = x
        cmd = [
            sq(samtools),
            "index",
            sq(out_filename),
            ]
        x = " ".join(cmd)
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores, path=out_path)

    # TODO: Check for output files.

    return metadata
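
# A possible way to address the TODO above (a sketch, not part of the original
# module): "samtools index <file>.bam" writes its index to <file>.bam.bai by
# default, so the output could be verified by checking that each .bai file
# exists and is non-empty.
def _check_index_outputs_sketch(jobs):
    from genomicode import filelib
    # jobs is the list of (in_filename, out_filename) pairs built above.
    x = [out_filename + ".bai" for (in_filename, out_filename) in jobs]
    filelib.assert_exists_nz_many(x)
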

def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # list of (in_filename, err_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        x = in_filename, err_filename, out_filename
        jobs.append(x)

    # samtools mpileup -f [reference sequence] [BAM file(s)]
    #   > myData.mpileup
    samtools = mlib.findbin("samtools")
    sq = mlib.sq
    commands = []
    for x in jobs:
        in_filename, err_filename, out_filename = x
        x = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            ]
        x.append(sq(in_filename))
        x = " ".join(map(str, x))
        x = "%s 2> %s 1> %s" % (x, err_filename, out_filename)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    x = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
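
# For reference, one command generated by the loop above looks roughly like
# this (sample name and paths are illustrative, not from the original):
#
#   samtools mpileup -f hg19.fa /in/sample01.bam 2> out/sample01.log 1> out/sample01.pileup
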

def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    #from genomicode import hashlib
    from Betsy import module_utils

    in_filenames = module_utils.find_bam_files(in_data.identifier)
    assert in_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    jobs = []
    #seen = {}
    for i, in_filename in enumerate(in_filenames):
        p, f = os.path.split(in_filename)
        temp_prefix = "temp_%s" % f
        #temp_prefix = "temp_%s" % hashlib.hash_var(f)
        # Make sure no duplicates.
        #assert temp_prefix not in seen
        #seen[temp_prefix] = 1
        #temp_outfilename = "%d.bam" % i
        out_filename = os.path.join(out_path, f)
        x = filelib.GenericObject(
            in_filename=in_filename,
            temp_prefix=temp_prefix,
            #temp_outfilename=temp_outfilename,
            out_filename=out_filename)
        jobs.append(x)

    samtools = filelib.which_assert(config.samtools)

    # Calculate the number of threads per process.
    nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
    num_threads = max(nc / len(jobs), 1)

    # Make a list of samtools commands.
    # Without -m, takes ~1 Gb per process.
    sq = parallel.quote
    commands = []
    for j in jobs:
        # Usage has changed.  Below no longer valid.
        # samtools sort <in_filename> <out_filestem>
        # .bam automatically added to <out_filestem>, so don't need it.
        #x = out_filename
        #assert x.endswith(".bam")
        #x = x[:-4]
        #out_filestem = x

        x = [
            sq(samtools),
            "sort",
            "-O", "bam",
            "-T", sq(j.temp_prefix),
            "-m", "4G",  # Crashing, so try increasing memory.
            sq(j.in_filename),
            #"-o", sq(j.temp_outfilename),
            "-o", sq(j.out_filename),
            ]
        if num_threads > 1:
            x += ["-@", num_threads]
        x = " ".join(map(str, x))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = nc

    parallel.pshell(commands, max_procs=nc)
    #for cmd in commands:
    #    parallel.sshell(cmd)

    #for j in jobs:
    #    # Move the temporary files to the final location.
    #    shutil.move(j.temp_outfilename, j.out_filename)

    # Make sure the analysis completed successfully.
    x = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
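
# calc_max_procs_from_ram comes from Betsy.module_utils and is not shown here.
# A rough sketch of the idea, under the assumption that it limits the number
# of concurrent processes so that <gb_per_proc> Gb each fits in the machine's
# physical RAM, capped at upper_max (hypothetical, not the original code):
def _calc_max_procs_from_ram_sketch(gb_per_proc, upper_max):
    import os
    total_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    total_gb = total_bytes / (1024.0 ** 3)
    nproc = int(total_gb / gb_per_proc)
    return max(1, min(nproc, upper_max))
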

def main():
    import os
    import argparse
    import itertools

    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("reference_genome", help="fasta file")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--dry_run", action="store_true",
        help="Just display the commands, and don't generate the alignment.")
    parser.add_argument(
        "--window", default=80, type=int,
        help="Number of bases in alignment. Default: 80")

    group = parser.add_argument_group(title="Input")
    group.add_argument("--bam_file", help="Indexed BAM file.")
    group.add_argument("--bam_path", help="Path to BAM files.")
    group.add_argument(
        "--position", action="append", default=[],
        help="Specify a position to view, "
        "e.g. chr20:45,927,663 or chr20:45927663. 1-based coordinates")
    group.add_argument(
        "--position_file",
        help="Tab-delimited text file with two columns. "
        "Column 1 is chromosome, column 2 is position.")

    group = parser.add_argument_group(title="Output")
    group.add_argument("--prefix", help="Prepend a prefix to each outfile.")
    group.add_argument(
        "--outpath",
        help="If multiple alignments are generated, this option "
        "directs where to save the output files.")
    group.add_argument(
        "--noclobber", action="store_true",
        help="If an output file already exists, don't overwrite it.")

    # Parse the input arguments.
    args = parser.parse_args()
    filelib.assert_exists_nz(args.reference_genome)
    assert args.bam_file or args.bam_path, \
        "Either --bam_file or --bam_path must be provided."
    assert not (args.bam_file and args.bam_path), \
        "Cannot specify both --bam_file and --bam_path."
    if args.bam_file:
        filelib.assert_exists_nz(args.bam_file)
    if args.bam_path:
        assert os.path.exists(args.bam_path)
    if args.position_file:
        filelib.assert_exists_nz(args.position_file)
    if args.outpath and not os.path.exists(args.outpath):
        os.mkdir(args.outpath)
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.window >= 1 and args.window < 500

    bam_filenames = []
    if args.bam_file:
        bam_filenames.append(args.bam_file)
    else:
        x = os.listdir(args.bam_path)
        x = [x for x in x if x.endswith(".bam")]
        x = [os.path.join(args.bam_path, x) for x in x]
        bam_filenames = x
    assert bam_filenames, "No bam files found."

    positions = []  # list of (chrom, pos)
    # _parse_position is defined elsewhere; see the sketch after this function.
    for x in args.position:
        chrom, pos = _parse_position(x)
        positions.append((chrom, pos))
    if args.position_file and os.path.exists(args.position_file):
        for cols in filelib.read_cols(args.position_file):
            assert len(cols) == 2, "Position file should have 2 columns"
            chrom, pos = cols
            pos = int(pos)
            assert pos >= 1
            positions.append((chrom, pos))
    assert positions, "No positions specified."

    # Make the commands.
    assert hasattr(config, "samtools")
    filelib.assert_exists(config.samtools)

    # Make sure we have the right version of samtools.
    # 1.2 (using htslib 1.2.1)
    # 0.1.18 (r982:295)
    version = alignlib.get_samtools_version()
    x = version.split(".")
    assert len(x) >= 2
    major = x[0]
    assert major in ["0", "1"], "Unknown samtools version: %s" % version
    major = int(major)
    assert major >= 1, \
        "Requires samtools >= 1 (Current version: %s)" % version

    commands = []
    for x in itertools.product(bam_filenames, positions):
        bam_filename, (chrom, pos) = x
        p, f = os.path.split(bam_filename)
        sample, e = os.path.splitext(f)
        left = max(pos - args.window / 2, 1)
        pos_str = "%s:%s" % (chrom, left)

        x = "%2s.%9s.%s.html" % (chrom, pos, sample)
        if args.prefix:
            x = "%s.%s" % (args.prefix, x)
        if args.outpath:
            x = os.path.join(args.outpath, x)
        out_filename = x
        if args.noclobber and os.path.exists(out_filename):
            continue

        # samtools tview -d t -p 7:100550778 bam01/196B-lung.bam $FA
        sq = parallel.quote
        x = [
            sq(config.samtools),
            "tview",
            "-d", "h",
            "-p", pos_str,
            sq(bam_filename),
            sq(args.reference_genome),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(out_filename))
        commands.append(x)

    if args.dry_run:
        for x in commands:
            print x
        return

    parallel.pshell(commands, max_procs=args.num_procs)
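
# _parse_position is called in main() but defined elsewhere in this script.
# A minimal sketch, assuming it converts "chr20:45,927,663" (or
# "chr20:45927663") into a (chromosome, 1-based position) tuple:
def _parse_position_sketch(position_str):
    x = position_str.split(":")
    assert len(x) == 2, "Position should look like chr20:45927663"
    chrom, pos = x
    pos = int(pos.replace(",", ""))
    assert pos >= 1
    return chrom, pos


# Example invocation (script name, paths, and positions are illustrative):
#   python tview_alignments.py hg19.fa --bam_path bam01 \
#     --position chr20:45,927,663 --outpath alignments -j 4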