def miraligner(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(args.hairpin, args.sps)
    matures = _read_mature(args.mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        if bam_fn.endswith("bam") or bam_fn.endswith("sam"):
            logger.info("Reading %s" % bam_fn)
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = op.splitext(bam_fn)[0] + "_sort"
            pysam.sort("-n", bam_fn, bam_sort_by_n)
            reads = _read_bam(bam_sort_by_n + ".bam", precursors)
        elif bam_fn.endswith("fasta") or bam_fn.endswith("fa") or bam_fn.endswith("fastq"):
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            if args.miraligner:
                _cmd_miraligner(bam_fn, out_file, args.sps, args.hairpin)
                reads = _read_miraligner(out_file)
            else:
                if bam_fn.endswith("fastq"):
                    bam_fn = _convert_to_fasta(bam_fn)
                logger.info("Aligning %s" % bam_fn)
                if not file_exists(out_file):
                    pyMatch.Miraligner(hairpin, bam_fn, out_file, 1, 4)
                reads = _read_pyMatch(out_file, precursors)
        else:
            raise ValueError("Format not recognized.")
        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)
        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)
        try:
            vcf_file = op.join(args.out, sample + ".vcf")
            if not file_exists(vcf_file):  # if True:
                create_vcf(dt_pre, matures, gtf, vcf_file)
            try:
                import vcf
                vcf.Reader(filename=vcf_file)
            except Exception as e:
                logger.warning(e.__doc__)
                logger.warning(e.message)
        except Exception as e:
            # traceback.print_exc()
            logger.warning(e.__doc__)
            logger.warning(e.message)
        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)
    if out_dts:
        _create_counts(out_dts, args.out)
        # _summarize(out_dts)
    else:
        print "No files analyzed!"
def align_to_bam_file(self, reference_fasta_path, query_fasta_path,
                      output_bam_path, multiple=False, assert_record=None):
    logging.debug('LastzRunner: running on reference %s and query %s' %
                  (reference_fasta_path, query_fasta_path))
    output_sam_path = os.path.abspath(
        os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
    output_bam_unsorted_path = os.path.abspath(
        os.path.expandvars(output_bam_path + '.unsorted'))
    logging.debug(
        'LastzRunner: aligning with output in temporary sam file %s' %
        output_sam_path)
    with open(output_sam_path, 'w') as output_sam_handler:
        for line in self._align(reference_fasta_path, query_fasta_path,
                                multiple):
            output_sam_handler.write(line)
    logging.debug(
        'LastzRunner: transforming sam into unsorted bam file %s' %
        output_bam_unsorted_path)
    input_sam_handler = pysam.Samfile(output_sam_path, "r")
    output_bam_file = pysam.Samfile(
        output_bam_unsorted_path, "wb", template=input_sam_handler)
    logging.debug('LastzRunner: copying from sam file to bam file')
    for s in input_sam_handler:
        output_bam_file.write(s)
    output_bam_file.close()
    logging.debug('LastzRunner: sorting and indexing bam file %s' %
                  output_bam_path)
    pysam.sort(output_bam_unsorted_path, output_bam_path.replace('.bam', ''))
    pysam.index(output_bam_path)
def convertSortAlign(output_filename):
    # Pregenerate file names for all the intermediate steps
    # (output_filename is the output of the Bowtie2 alignment).
    # Note that the file extension is not always given, depending on the
    # input conventions of the tool being called.
    sam_filename = output_filename + '.sam'
    bam_filename = output_filename + '.bam'
    sorted_filename_input = output_filename + '_sorted'
    sorted_filename_output = output_filename + '_sorted.bam'

    # convert sam to bam
    print 'Converting {0} to {1} . . .'.format(sam_filename, bam_filename)
    try:
        SamtoBam(sam_filename, bam_filename)
    except Exception as ex:
        print "Error converting sam to bam ({0}): {1}".format(ex.errno, ex.strerror)
        return False

    # sort
    print 'Sorting {0} -> {1}'.format(bam_filename, sorted_filename_output)
    try:
        pysam.sort(bam_filename, sorted_filename_input)
    except Exception as ex:
        print "Error sorting bam file ({0}): {1}".format(ex.errno, ex.strerror)
        return False

    # index
    print 'Indexing {0} . . .'.format(sorted_filename_output)
    try:
        pysam.index(sorted_filename_output)
    except Exception as ex:
        print "Error indexing bam file ({0}): {1}".format(ex.errno, ex.strerror)
        return False

    print
    print 'Done'
    return True
def sort_by_position(bam_file, dir):
    ## get the file prefix
    prefix = ""
    prefix_match = re.match(r"(.*).bam", bam_file)
    try:
        prefix = prefix_match.group(1)
    except:
        print "Exiting: Invalid bam file -i %s" % (bam_file)
        sys.exit(2)

    # sort the bam file
    bam_input = dir + bam_file
    sort_bam = dir + prefix + "_sorted"
    pysam.sort(bam_input, sort_bam)
    sort_bam = sort_bam + ".bam"

    # index the sorted bam file
    pysam.index(sort_bam)

    print ""
    print "Writing Sorted Bam File : %s" % (sort_bam)
    print "Writing Index Sorted Bam File : %s.bai" % (sort_bam)
    return sort_bam
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path, args):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    null = open("/dev/null")  # open("/tmp/bwa_out")
    subprocess.check_call(["bwa", "index", "-p", genome_db, genome_path], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe1_path], stdout=pe1_file, stderr=null)
    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe2_path], stdout=pe2_file, stderr=null)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call(["bwa", "sampe",
                               "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                               genome_db, pe1_output, pe2_output, pe1_path, pe2_path],
                              stdout=bwa_file, stderr=null)
    if args.sam:
        shutil.move(bwa_output, output_path + '.sam')
        # os.rename(bwa_output, output_path + '.sam')
    else:
        sam_to_bam(bwa_output, bwa_output + ".bam")
        if args.sort:
            # coordinate sort the file
            pysam.sort(bwa_output + ".bam", output_path)
            pysam.index(output_path + '.bam')
        else:
            shutil.move(bwa_output + ".bam", output_path + '.bam')
def disciple(bam_fname, bam_hdr, rg_id, long_qname_table, cigar_v2, in_queue):
    """Create a BAM file from the FASTQ lines fed to it via in_queue

    :param bam_fname:
    :param bam_hdr:
    :param rg_id:
    :param long_qname_table:
    :param cigar_v2:
    :param in_queue:
    :return:
    """
    logger.debug('Writing to {} ...'.format(bam_fname))
    t0 = time.time()
    fp = pysam.AlignmentFile(bam_fname, 'wb', header=bam_hdr)
    ref_dict = {k['SN']: n for n, k in enumerate(bam_hdr['SQ'])}
    cnt = 0
    for cnt, (qname, read_data) in enumerate(iter(in_queue.get, __process_stop_code__)):
        write_perfect_reads(qname, rg_id, long_qname_table, ref_dict, read_data, cigar_v2, fp)
    fp.close()
    t1 = time.time()
    logger.debug('... {}: {} reads in {:0.2f}s ({:0.2f} t/s)'.format(
        bam_fname, cnt, t1 - t0, cnt / (t1 - t0)))

    logger.debug('Sorting {} -> {}'.format(bam_fname, bam_fname + '.sorted'))
    t0 = time.time()
    pysam.sort('-m', '1G', '-o', bam_fname + '.sorted', bam_fname)
    os.remove(bam_fname)
    t1 = time.time()
    logger.debug('... {:0.2f}s'.format(t1 - t0))

    logger.debug('Shutting down thread for {}'.format(bam_fname))
def sort_output(outPrefix):
    '''Sorts the output file by read coordinate'''
    pysam.sort(outPrefix + '.originalSort.bam', outPrefix + '.coordSort')
    # os.remove(outPrefix + '.originalSort.tmp.bam')

    ## Build the bam index for output
    pysam.index(outPrefix + '.coordSort.bam')
def makeAggregate(cells, directory, suffix, output):
    """ Create aggregate sample.

    Make an aggregate bam file from a list of cells, sorts and indexes the
    file for easy use in IGV. Suffix is required to prevent non 0-padded
    numbers matching the wrong files. Return final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String to match the end of the bam file, use to add file extension
        and to anchor the extension after file numbers - this will prevent
        cell_4 matching cell_4*.
    output : string
        String containing output file location.
    """
    from glob import glob
    cells = set(cells)
    fileList = []
    for cell in cells:
        fileList.append(glob(os.path.join(directory, "*" + cell + suffix))[0])
    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort(output + ".bam", output + ".sorted", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)
    return output + ".sorted.bam"
def run(self):
    AbstractAnalysis.run(self)  # Call base method to do some logging
    localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
    localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
    samToBamFile(self.samFile, localBamFile)
    pysam.sort(localBamFile, localSortedBamFile)
    pysam.index(localSortedBamFile + ".bam")
    pysam.faidx(self.referenceFastaFile)
    file_header = self.readFastqFile.split(".fastq")[0].split("/")[-1] + "_" + \
        self.referenceFastaFile.split(".fa")[0].split("/")[-1]
    consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
    consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")
    system("samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s"
           % (self.referenceFastaFile, localSortedBamFile + ".bam", consensus_vcf))
    system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
    system("rm -rf %s" % (self.referenceFastaFile + ".fai"))
    formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")
    formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
    system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))
    self.finish()
def main(infile, snp_dir, max_window=MAX_WINDOW_DEFAULT, is_paired_end=False,
         is_sorted=False):
    name_split = infile.split(".")
    if len(name_split) > 1:
        pref = ".".join(name_split[:-1])
    else:
        pref = name_split[0]

    if not is_sorted:
        pysam.sort(infile, pref + ".sort")
        infile = pref + ".sort"
        sort_file_name = pref + ".sort.bam"
    else:
        sort_file_name = infile

    keep_file_name = pref + ".keep.bam"
    remap_name = pref + ".to.remap.bam"
    remap_num_name = pref + ".to.remap.num.gz"

    if is_paired_end:
        fastq_names = [pref + ".remap.fq1.gz", pref + ".remap.fq2.gz"]
    else:
        fastq_names = [pref + ".remap.fq.gz"]

    bam_data = BamScanner(is_paired_end, max_window, sort_file_name,
                          keep_file_name, remap_name, remap_num_name,
                          fastq_names, snp_dir)
    bam_data.run()
def bwa_mem(pe1_path, pe2_path, genome_path, threads, output_path):
    print 'Aligning with bwa mem'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    stderr_file = open(output_path + '.bwa.1', 'w')
    # null = open("/dev/null")
    subprocess.check_call(["bwa", "index", "-p", genome_db, genome_path],
                          stderr=stderr_file)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call(["bwa", "mem", "-t", threads, genome_db,
                               pe1_path, pe2_path],
                              stdout=bwa_file, stderr=stderr_file)
    elapsed = time() - start
    print 'Time elapsed for bwa mem: ', elapsed
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
    shutil.rmtree(work_dir)
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    null = open("/dev/null")
    subprocess.check_call(["bwa", "index", "-p", genome_db, genome_path], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe1_path], stdout=pe1_file, stderr=null)
    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe2_path], stdout=pe2_file, stderr=null)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call(["bwa", "sampe",
                               "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                               genome_db, pe1_output, pe2_output, pe1_path, pe2_path],
                              stdout=bwa_file, stderr=null)
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"
            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))
            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))
            logging.info(" Outpath: {}".format(outbam_path))

            # print out just the reads we're interested in, for use later
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)
            for read in sample.readStatistics.reads:
                bam_small.write(read)
            bam_small.close()

            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path + ".bam")
def bwa_sampe(pe1_path, pe2_path, genome_path, output_path):
    print 'Aligning with bwa aln/sampe'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    null = open("/dev/null")
    subprocess.check_call(["bwa", "index", "-p", genome_db, genome_path], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe1_path], stdout=pe1_file, stderr=null)
    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call(["bwa", "aln", genome_db, pe2_path], stdout=pe2_file, stderr=null)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call(["bwa", "sampe", genome_db, pe1_output, pe2_output,
                               pe1_path, pe2_path],
                              stdout=bwa_file, stderr=null)
    elapsed = time() - start
    print 'Time elapsed for bwa aln/sampe: ', elapsed
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
def check_bam(bam, p, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths
    """
    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    chrom_sizes = {}
    p = str(p)
    for i in test_head.header['SQ']:
        chrom_sizes[i['SN']] = int(i['LN'])
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print ' sorting bam file'
        pysam.sort('-@', p, bam, 'sorted.temp')
        os.remove(bam)
        os.rename('sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'coordinate':
            pass
        else:
            print ' sorting bam file'
            pysam.sort('-@', p, bam, 'sorted.temp')
            os.remove(bam)
            os.rename('sorted.temp.bam', bam)
    test_head.close()
    # check if indexed
    if '{}.bai'.format(bam) in os.listdir('.') and make_new_index is False:
        pass
    else:
        print ' indexing bam file'
        pysam.index(bam)
    return chrom_sizes
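# Note: check_bam() above uses the legacy positional pysam.sort(<in>, <out_prefix>)
# convention (pysam < 0.9). Below is a minimal sketch of the same in-place sort
# written against the current dispatcher-style API, where every samtools argument
# is passed as a string; the helper name and the temporary file name are
# illustrative, not part of the original code.
import os
import pysam


def sort_in_place(bam, threads=1):
    """Coordinate-sort `bam` in place, assuming pysam >= 0.9 where
    pysam.sort() forwards its arguments verbatim to `samtools sort`."""
    tmp = bam + '.sorting.tmp.bam'   # illustrative temporary name
    pysam.sort('-@', str(threads), '-o', tmp, bam)
    os.remove(bam)                   # mirror the remove/rename pattern used above
    os.rename(tmp, bam)
    pysam.index(bam)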
def run_cufflinks(org_db, num_cpus=4): """ run cufflinks program on mapped reads """ try: subprocess.call(["cufflinks"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) except: exit("Please make sure that the `Cufflinks` binary is in your $PATH") org_name = org_db['short_name'] print("preparing for cufflinks run for organism %s" % org_name) min_intron_length = 20 min_isoform_frac = 0.25 max_intron_length = org_db['max_intron_len'] result_dir = org_db['read_assembly_dir'] bam_file = "%s/%s_Aligned_mmr_sortbyCoord.bam" % (org_db['read_map_dir'], org_name) if not os.path.isfile(bam_file): sys.stdout.write("failed to fetch sorted mmr BAM file for organism: %s, trying to get the mmr file...\n" % org_name) bam_file = "%s/%s_Aligned_mmr.bam" % (org_db['read_map_dir'], org_name) if not os.path.isfile(bam_file): exit("error: failed to fetch mmr BAM file for organism %s" % org_name) ## sorting, indexing the bam file file_prefix, ext = os.path.splitext(bam_file) sorted_bam = "%s_sortbyCoord" % file_prefix sys.stdout.write("trying to sort based by the coordinates with output prefix as: %s\n" % sorted_bam) if not os.path.isfile("%s.bam" % sorted_bam): pysam.sort(bam_file, sorted_bam) bam_file = "%s.bam" % sorted_bam print('using bam file from %s' % bam_file) if not os.path.exists(bam_file + ".bai"): pysam.index(bam_file) ## always use quiet mode to avoid problems with storing log output. cli_cuff = "cufflinks -q --no-update-check \ -F %.2f \ -I %d \ --min-intron-length %d \ --library-type fr-unstranded \ -p %d \ -o %s \ %s" % (min_isoform_frac, max_intron_length, min_intron_length, num_cpus, result_dir, bam_file) sys.stdout.write('\trun cufflinks as: %s \n' % cli_cuff) try: os.chdir(result_dir) process = subprocess.Popen(cli_cuff, shell=True) returncode = process.wait() if returncode !=0: raise Exception, "Exit status return code = %i" % returncode except Exception, e: print 'Error running cufflinks.\n%s' % str( e )
def convert_sam_to_bam():
    """
    This method should take a newly created .sam file from alignment and
    - convert it to .bam
    - sort .bam
    - index .bam
    """
    ids = generate_ids()
    for id in ids:
        start_time = time()
        print 'converting: %s' % id
        base_path = os.path.join(SAMPLE_DIR, id)
        sam_path = os.path.join(base_path, id + '-bwape.sam')
        bam_path = os.path.join(base_path, id + '-bwape.bam')

        bam_content = pysam.view('-bS', sam_path)
        bam_file = open(bam_path, 'w+')
        bam_file.writelines(bam_content)
        bam_file.close()

        pysam.sort(bam_path, bam_path + '_sorted')
        pysam.index(bam_path + '_sorted.bam')

        # indexing creates file.bam.bam. Move it to file.bam
        bam_call = "mv {0} {1}".format(bam_path + '_sorted.bam', bam_path)
        index_call = "mv {0} {1}".format(bam_path + '_sorted.bam.bai', bam_path + '.bam.bai')
        subprocess.call(bam_call, shell=True)
        subprocess.call(index_call, shell=True)

        end_time = time()
        print 'completed: %.3fs' % (end_time - start_time)
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()
    input_fname, output_fname = args

    slots = os.getenv('GALAXY_SLOTS', 1)
    pysam.sort("-@%s" % slots, '-o', output_fname, '-O', 'bam', '-T', '.', input_fname)
def extend_bam(bam, type, reheader, size=0): bam_prefix = bam.split(".bam")[0] bam_file = pysam.Samfile(bam, 'rb') tmp_name = bam_prefix + ".bed" tmp_bed = open(tmp_name, 'w') size_name = str(size) if size == 0 : size_name = "insert" out_name = "_".join([bam_prefix, type, size_name]) + ".bam" out_bam = open(out_name, 'w') #pdb.set_trace() ## Convert BAM to temporary BED try: print "BAM to BED..." if type=="extend": bamToFragmentBed(bam_file, tmp_bed, size) elif type=="dyad": trimToDyad(bam_file, tmp_bed, size) except: print "BAM to BED conversion failed." print ">> " + ":".join(sys.exc_info()[1]) tmp_bed.close() out_bam.close() os.remove(tmp_name) return else: print "BAM to BED conversion successful." tmp_bed.close() #out_bam.close() ## Convert tmp bed to bam bedToBam(tmp_name, out_name) ## Replace header if reheader: cmd_args1 = ['samtools', 'view', '-h', bam] cmd_args2 = ['samtools', 'reheader', '-', out_name] tmp_name = bam_prefix + "_tmp" tmp = open(tmp_name, 'w') try: print "Reheader..." p1 = Popen(cmd_args1, stdout=PIPE) p2 = Popen(cmd_args2, stdin=p1.stdout, stdout=tmp) p2.wait() except: print "Failed reheader" tmp.close() os.remove(tmp_name) return else: #os.remove(bam) tmp.close() #os.rename(tmp_name, out_name) print "Sorting..." pysam.sort(out_name, out_name + "_sort") os.rename(out_name + "_sort.bam", out_name) pysam.index(out_name)
def run_mmr(org_name, read_map_dir, threads=3): """ a pythonic wrapper for multiple mapper resolution program @args org_name: Organism name, example case A_thaliana @type org_name: str @args read_map_dir: directory where the STAR bam (aligned reads) file located @type read_map_dir: str @args threads: number of threads to use for the run (default: 3) @type threads: int """ import pysam try: subprocess.call(["mmr"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) except: exit("Please make sure that the `mmr` binary is in your $PATH") ## mmr works well with bam file sorted by read id bam_file = "%s/%s_Aligned.sortedByName.out.bam" % (read_map_dir, org_name) if not os.path.isfile(bam_file): sys.stdout.write( "warning: failed to fetch read id sorted BAM file for organism: %s, trying to get the raw alignment file\n" % org_name ) bam_file = "%s/%s_Aligned.out.bam" % (read_map_dir, org_name) ## unsorted bam file from STAR output if not os.path.isfile(bam_file): exit("error: failed to fetch STAR read alignment file for %s %s\n" % (org_name, bam_file)) ## sorting bam file sorted_bam = "%s/%s_Aligned.sortedByName.out" % (read_map_dir, org_name) if not os.path.isfile("%s.bam" % sorted_bam): sys.stdout.write("trying to sort based by read id with output prefix as: %s\n" % sorted_bam) pysam.sort("-n", bam_file, sorted_bam) bam_file = "%s.bam" % sorted_bam sys.stdout.write("using bam file from %s\n" % bam_file) outFile = "%s/%s_Aligned_mmr.bam" % (read_map_dir, org_name) iterations = 3 ## provide a bam file sorted by read id cli_mmr = "module load gcc; mmr -b -p -V -t %d -I %d -o %s %s" % (threads, iterations, outFile, bam_file) try: sys.stdout.write("\trun MMR as: %s \n" % cli_mmr) ## changing the working dir to run mmr os.chdir(read_map_dir) process = subprocess.Popen(cli_mmr, shell=True) returncode = process.wait() if returncode != 0: raise Exception, "Exit status return code = %i" % returncode sys.stdout.write("MMR run finished. result file stored at %s\n" % outFile) except Exception, e: exit("Error running MMR.\n%s" % str(e))
def run(self): # Phase 1 - Detection of BarCode self.bc = BarCode(self.bam) sys.stderr.write("[%s] Starting BarCode Analysis \n" % (self.get_time(),)) self.bc.simple_approach() sys.stderr.write("[%s] Analyzed BarCodes \n" % (self.get_time(),)) self.bc.write_barcodes(self.barcodes) sys.stderr.write("[%s] Wrote BarCodes\n" % (self.get_time(),)) # Phase 2 - Rewrite BAM sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),)) self.bc.load_barcodes(self.barcodes) sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),)) self.bc.bam.reset() self.bc.sort_and_rewrite_bam(self.rewritten_bam) pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", "")) sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),)) # Phase 3 - Build Consensus self.consensus = Consensus(self.rewritten_sorted_bam, self.ref) sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),)) self.consensus.build() sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),)) self.consensus.infer_consensus(self.consensus_reference) sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),)) # Phase 4 - Call Variants and Haplotypes self.consensus.output_consensus_genomes(self.consensus_genomes) sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),)) self.consensus.output_haplotype_distribution(self.haplotype_distribution) sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),)) self.ovcf = VCF(self.vcf, crossmap=self.crossmap) self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes) self.ovcf.output_vcf(self.ref.sequence) sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),)) # Phase 5 - Summary Statistics and Chain Files f_out = open(self.out, "w") self.consensus.output_consensus_coverage(f_out) self.ovcf.output_variants_distribution(f_out) self.bc.output_reads_in_barcode_distribution(f_out) f_out.close() sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),)) self.ochain = Chain(self.chain) self.ochain.output_chain(self.ref, self.consensus.inferred_consensus, self.consensus.inferred_structure) sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))
def sort_bam(bamfilename_sorted, bamfilename_unsorted=None):
    '''Sort BAM file'''
    import pysam

    if bamfilename_unsorted is None:
        bamfilename_unsorted = bamfilename_sorted[:-11] + '.bam'

    pysam.sort(bamfilename_unsorted, bamfilename_sorted[:-4])
def main(): parser=argparse.ArgumentParser() parser.add_argument("-p", action='store_true', dest='is_paired_end', default=False, help=('Indicates that reads are ' 'paired-end (default is single).')) parser.add_argument("-s", action='store_true', dest='is_sorted', default=False, help=('Indicates that the input bam file' ' is coordinate sorted (default ' 'is False).')) mdefault = 100000 mhelp = ('Changes the maximum window to search for SNPs. The default is ' '{:,} base pairs. Reads or read pairs that span more than this ' 'distance (usually due to splice junctions) will be thrown out. ' 'Increasing this window allows for longer junctions, but may ' 'increase run time and memory requirements.'.format(mdefault)) parser.add_argument("-m", action='store', dest='max_window', type=int, default=mdefault, help=mhelp) parser.add_argument("infile", action='store', help=("Coordinate sorted bam " "file.")) snp_dir_help = ('Directory containing the SNPs segregating within the ' 'sample in question (which need to be checked for ' 'mappability issues). This directory should contain ' 'sorted files of SNPs separated by chromosome and named: ' 'chr<#>.snps.txt.gz. These files should contain 3 columns: ' 'position RefAllele AltAllele') parser.add_argument("snp_dir", action='store', help=snp_dir_help) options = parser.parse_args() infile = options.infile snp_dir = options.snp_dir name_split = infile.split(".") if len(name_split) > 1: pref = ".".join(name_split[:-1]) else: pref = name_split[0] if not options.is_sorted: pysam.sort(infile, pref + ".sort") infile = pref + ".sort" sort_file_name = pref + ".sort.bam" else: sort_file_name = infile keep_file_name = pref + ".keep.bam" remap_name = pref + ".to.remap.bam" remap_num_name = pref + ".to.remap.num.gz" if options.is_paired_end: fastq_names = [pref + ".remap.fq1.gz", pref + ".remap.fq2.gz"] else: fastq_names = [pref + ".remap.fq.gz"] bam_data = BamScanner(options.is_paired_end, options.max_window, sort_file_name, keep_file_name, remap_name, remap_num_name, fastq_names, snp_dir) bam_data.run()
def sort(self):
    msg = "Sorting %s" % self.bamfn
    print(msg)
    tempfn_stem = os.path.join(self.basedir, temp_filename())
    print self.bamfn, tempfn_stem
    pysam.sort(self.bamfn, tempfn_stem)
    tempfn_glob = glob.glob(tempfn_stem + '*')
    assert len(tempfn_glob) == 1, \
        "Unexpected number of temporary output files: %r" % tempfn_glob
    tempfn = tempfn_glob[0]
    # rename our sorted bamfn
    os.rename(tempfn, self.bamfn)
def sort_and_rewrite_bam(self):
    self.bc = BarCode(self.bam)
    sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
    self.bc.load_barcodes(self.barcodes)
    sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
    self.bc.sort_and_rewrite_bam(self.rewritten_bam)
    pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
    sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))
def __sort(self):
    # output of pysam.sort(<in>, <prefix>) is <prefix>.bam
    bamfile = self.samfile + 'sorted.bam'
    if not os.path.isfile(bamfile):
        try:
            pysam.sort(self.samfile, self.samfile + 'sorted')
            sort_log = pysam.sort.getMessage()
            return True
        except:
            raise RuntimeError()
    else:
        print "already sorted"
        return False
def bowtie_align(b_path,read,ref,s_path,bowtie2,numOfThreads,nOffrate,reftype,recovering): # b_path: bowtie path; # s_path: samtools path; # bowtie2: logic, true/false # offrate is not used due to bowtie2 bug sam=read.split("/")[-1].split(".")[0]+".sam" hasFile = False if recovering and os.path.isfile("sort_" + read.split("/")[-1].split(".")[0] + ".bam"): # old file exists try: align = pysam.Samfile("sort_" + read.split("/")[-1].split(".")[0] + ".bam", "rb") hasFile = True except: hasFile = False if (not recovering) or (not hasFile): print >> sys.stderr, 'Start mapping.' if read.split(".")[-1] in ["fa","fasta"]: # allow fasta and fastq for read foption=" -f" else: foption="" if ref.split(".")[-1] in ["fa","fasta"]: base=ref.split("/")[-1].split(".")[0] os.system("rm "+read.split("/")[-1].strip()+".log") os.system(b_path+"-build "+ref+" "+base+" >> "+read+".log 2>&1") if not bowtie2: os.system(b_path+ foption+" -a --best --strata -n 1 -l 15 -e 200 -p " + str(numOfThreads) + " -S "+base+" "+read+" "+sam+" >> "+read.split("/")[-1]+".log 2>&1") else: os.system(b_path+ " -x "+base+foption+" -U "+read+ " -p " + str(numOfThreads) + " -i S,1,0.50 0R 3 -L 15 -D 20 -t " + ("-a " if reftype != "genome" else "") + "-S "+sam+" >> "+read.split("/")[-1]+".log 2>&1") else: os.system("rm "+read.split("/")[-1].strip()+".log") if not bowtie2: os.system(b_path+ foption+" -a --best --strata -n 1 -l 15 -e 200 -p " + str(numOfThreads) + " -S "+ref+" "+read+" "+sam+" >> "+read.split("/")[-1]+".log 2>&1") else: os.system(b_path+ " -x "+ref+foption+" -U "+read + " -p " + str(numOfThreads) + " -L 15 -D 20 -t " + ("-a " if reftype != "genome" else "") + "-S "+sam+" >> "+read.split("/")[-1]+".log 2>&1") bam=read.split("/")[-1].split(".")[0]+".bam" os.system(s_path+ " view -Sb -o "+bam +" "+sam) os.system("rm "+sam) pysam.sort("-n",bam,"temp") align=pysam.Samfile("temp.bam","rb") os.system("rm temp.bam") os.system(s_path+ " sort "+bam+ " "+"sort_"+read.split("/")[-1].split(".")[0]) os.system("rm "+bam) print >> sys.stderr, 'Mapping completed.' else: print >> sys.stderr, 'Old file exists, recovery in process.' return align
def SAM_to_BAM(samfile_name, bamfile_name):
    '''Converts a SAM file into an ordered and indexed BAM file.'''
    unsortedbamfile_name = samfile_name[:-4] + "_unsorted.bam"
    bamfile = open(unsortedbamfile_name, "wb")
    bamfile.write(pysam.view("-b", "-S", samfile_name))
    bamfile.close()

    if bamfile_name.endswith(".bam"):
        bamfile_name = bamfile_name[:-4]
    pysam.sort(unsortedbamfile_name, bamfile_name)
    pysam.index(bamfile_name + ".bam")
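# SAM_to_BAM() above captures the output of pysam.view in Python and relies on the
# old two-argument pysam.sort. Below is a minimal sketch of the same
# SAM -> sorted, indexed BAM conversion using pysam.AlignmentFile for the format
# conversion and the modern samtools-style pysam.sort arguments (pysam >= 0.9);
# the function and temporary file names are illustrative, not from the original code.
import os
import pysam


def sam_to_sorted_bam(sam_path, bam_path):
    """Convert a SAM file to a coordinate-sorted, indexed BAM file."""
    unsorted_bam = bam_path + '.unsorted.bam'   # illustrative temporary name
    with pysam.AlignmentFile(sam_path, 'r') as sam_in, \
            pysam.AlignmentFile(unsorted_bam, 'wb', template=sam_in) as bam_out:
        for read in sam_in:
            bam_out.write(read)
    pysam.sort('-o', bam_path, unsorted_bam)
    pysam.index(bam_path)
    os.remove(unsorted_bam)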
def main(): args = parser.parse_args() samfile = pysam.Samfile(args.bam, 'rb') junctionreads = pysam.Samfile(args.out_bam, 'wb', template=samfile) id_tag = args.group_on chosen_feature = args.feature if args.cufflinks: gff = GFFReader(args.gff, preset='cufflinks') else: gff = GFFReader(args.gff, tag_map={'ID': id_tag, 'Parent': 'Parent'}) written = set([]) for feature_name, feature in gff.get_features(): try: children = feature.children except AttributeError: continue if len(children) > 1: starts = dict([(j.start, j) for i,v in children.iteritems() for j in v.parts()]) if len(starts) > 1: parts = [(v.seqid, v.start, v.end) for i,v in starts.iteritems()] parts.sort(key=lambda x: x[1]) for ri, read in enumerate(parts[:-1]): read2 = parts[ri+1] reads = set([]) reads2 = set([]) read_dict = {} try: for i in samfile.fetch(read[0], int(read[2])-1, read[2]): if not i.overlap(int(read[2])-1, int(read[2])) or i.qname in written: continue reads.add(i.qname) read_dict[i.qname] = i # if not i.mate_is_unmapped: # mate = samfile.mate(i) # reads.add(mate.qname) # read_dict[mate.qname] = mate for i in samfile.fetch(read2[0], read2[1], int(read2[1])+1): if not i.overlap(int(read2[2])-1, int(read2[2])) or i.qname in written: continue reads2.add(i.qname) read_dict[i.qname] = i # if not i.mate_is_unmapped: # mate = samfile.mate(i) # reads2.add(mate.qname) # read_dict[mate.qname] = mate for i in reads&reads2: written.add(i) junctionreads.write(read_dict[i]) except ValueError: continue pysam.sort(args.out_bam, '%s_sort'%args.out_bam) pysam.index('%s_sort.bam'%args.out_bam)
def run(self):
    AbstractAnalysis.run(self)  # Call base method to do some logging
    emptyQual = False
    for entry in samIterator(pysam.Samfile(self.samFile, "r")):
        if entry.qual is None:
            emptyQual = True
    if emptyQual is False:
        localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
        localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
        samToBamFile(self.samFile, localBamFile)
        pysam.sort(localBamFile, localSortedBamFile)
        system("qualimap bamqc -bam %s -outdir %s" %
               (localSortedBamFile + ".bam", self.outputDir))
    self.finish()
def map_reads(pe1_path, pe2_path, genome_path, output_path):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    subprocess.call(["bwa", "index", "-p", genome_db, genome_path])
    with open(pe1_output, "w") as pe1_file:
        subprocess.call(["bwa", "aln", genome_db, pe1_path], stdout=pe1_file)
    with open(pe2_output, "w") as pe2_file:
        subprocess.call(["bwa", "aln", genome_db, pe2_path], stdout=pe2_file)
    with open(bwa_output, "w") as bwa_file:
        subprocess.call(["bwa", "sampe",
                         "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                         genome_db, pe1_output, pe2_output, pe1_path, pe2_path],
                        stdout=bwa_file)
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
def bowtie2(bam_trimmed: str, fasta: str, sam_aligned: str) -> str:
    """Sort the BAM by read name, then align to the reference using bowtie2"""
    if not sam_aligned.endswith("_aligned.sam"):
        log.error("sam_aligned must end in '_aligned.sam'.")

    bam_sorted = bam_trimmed.replace("_trimmed.bam", "_sorted.bam")
    pysam.sort("-n", "-o", bam_sorted, bam_trimmed)  # Sort by QNAME
    pysam.index(bam_sorted)

    index_prefix = os.path.basename(fasta.split('.')[0])
    index_prefix = os.path.join(os.path.dirname(sam_aligned), index_prefix)
    command = f"bowtie2-build -f {fasta} --threads 8 {index_prefix}"
    log.sep()
    execute(command)

    command = \
        f"""
        bowtie2 -a --very-sensitive-local --norc \
            -x {index_prefix} \
            -p 8 \
            -b {bam_sorted} \
            -S {sam_aligned} \
        """
    execute(command)
    log.sep()
    return sam_aligned
def _compress_bam(bam_input, bam_output, ref_fname, regions=None, threads=1):
    """Compress a bam into run length encoding (RLE).

    :param bam_input: str, name of the bam file to be compressed
    :param bam_output: str, name of the bam to be produced
    :param ref_fname: str, reference filename, used to produce bam_input
    :param regions: list, genomic regions to be extracted
    :param threads: int, number of workers to be used

    :returns: None
    """
    regions = medaka.common.get_regions(bam_input, regions)
    ref_fasta = pysam.FastaFile(ref_fname)

    with pysam.AlignmentFile(bam_input, 'r') as alignments_bam:
        tmp_output = '{}.tmp'.format(bam_output)
        with pysam.AlignmentFile(
                tmp_output, 'wb', header=alignments_bam.header) as output:
            for region in regions:
                bam_current = alignments_bam.fetch(
                    reference=region.ref_name,
                    start=region.start,
                    end=region.end)
                ref_sequence = ref_fasta.fetch(region.ref_name)
                ref_rle = RLEConverter(ref_sequence)
                func = functools.partial(_compress_alignment, ref_rle=ref_rle)
                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=threads) as executor:
                    for chunk in medaka.common.grouper(bam_current, 100):
                        for new_alignment in executor.map(func, chunk):
                            if new_alignment is not None:
                                output.write(new_alignment)

    pysam.sort("-o", bam_output, tmp_output)
    os.remove(tmp_output)
    pysam.index(bam_output)
def bam_diff(f1, f2, T_TEST_DIR):
    basename, ext = os.path.splitext(os.path.basename(f1))

    f1sorted = T_TEST_DIR + basename + '.f1.sorted.bam'
    f2sorted = T_TEST_DIR + basename + '.f2.sorted.bam'

    pysam.sort(f1, '-n', '-o', f1sorted)
    pysam.sort(f2, '-n', '-o', f2sorted)

    f1sam = T_TEST_DIR + basename + '.f1.sam'
    f2sam = T_TEST_DIR + basename + '.f2.sam'

    fhq = open(f1sam, "w")
    fhq.write(pysam.view('-h', f1sorted))
    fhq.close()

    fhq = open(f2sam, "w")
    fhq.write(pysam.view('-h', f2sorted))
    fhq.close()

    subprocess.Popen([
        'sed', '-i', '-r',
        's@(SA:[^\\t]+)\\t(LB:[^\\t]+)\t(RG:[^\\t]+)@\\3\\t\\1\\t\\2@',
        f2sam
    ], stdout=subprocess.PIPE).stdout.read()
    subprocess.Popen(['sed', '-i', '-r', 's@\\tFI:i:[0-9]+@@', f1sam],
                     stdout=subprocess.PIPE).stdout.read()
    subprocess.Popen(['sed', '-i', '-r', 's@\\tFI:i:[0-9]+@@', f2sam],
                     stdout=subprocess.PIPE).stdout.read()

    # one time only
    # subprocess.Popen(['sed', '-i', '-r', 's@\\tSA:Z:[^\\t]+@@', f1sam], stdout=subprocess.PIPE).stdout.read()
    # subprocess.Popen(['sed', '-i', '-r', 's@\\tSA:Z:[^\\t]+@@', f2sam], stdout=subprocess.PIPE).stdout.read()

    return filecmp.cmp(f1sam, f2sam), f1sam, f2sam
def preprocess_sam(sam_files, datasets, tmp_dir, n_threads=0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset. """

    # Create the tmp dir
    os.system("mkdir -p %s " % (tmp_dir))

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = tmp_dir + dataset + "_unsorted.bam"
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        sorted_bam = tmp_dir + dataset + ".bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = tmp_dir + "merged.bam"
    merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)]
    # index_args = [merged_bam, "-@", str(n_threads)]

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam
def bam_merge(bam_ins, bam_out):
    """
    merge multiple bam files
    input: list of bam files
    input: out.bam
    """
    # check input files
    bam_flag = []
    for b in bam_ins:
        if not os.path.exists(b) is True:
            bam_flag.append(b)
    if len(bam_flag) > 0:
        sys.exit('BAM files not exists:' + '\n'.join(bam_flag))

    # check output file
    if os.path.exists(bam_out) is True:
        pass
        # sys.exit('BAM exists:' + bam_out)
    else:
        # merge
        pysam.merge('-f', bam_out + '.unsorted.bam', *bam_ins)  # overwrite output BAM
        pysam.sort('-o', bam_out, bam_out + '.unsorted.bam')
        pysam.index(bam_out)
        os.remove(bam_out + '.unsorted.bam')
def main(opts):
    """
    Main function: open the samfile, collect the correct header and write the
    output as a bam file. A sorted bam file is also outputted. The function
    also flags the XA-marked regions in the samfile, which can then be
    excluded in the variant calling.
    """
    samfile = pysam.Samfile(opts.sam_file, 'r')
    new_header = samfile.header.copy()
    final_header = complete_header(opts.read_group_tags, samfile, new_header)

    # "wb" here means write as bam file.
    if not opts.output.endswith(".bam"):
        opts.output = opts.output + ".bam"
    writer_piet = pysam.AlignmentFile(opts.output, "wb", header=final_header)
    for read in samfile:
        tags = dict(read.tags)
        if "XA" in tags:
            read.is_qcfail = True
        writer_piet.write(read)
    writer_piet.close()
    samfile.close()

    print("Bam file created, continuing to sort bam file....")
    path_and_name = os.path.split(opts.output)
    path_and_name = list(path_and_name)
    sorted_bam_out = os.path.join(path_and_name[0], "sorted_" + path_and_name[1])
    pysam.sort("-o", sorted_bam_out, opts.output)
    print("Done, Sorted bam file created")
def indexBamFile():
    ## indexing bam files to use pysam
    logging.debug("indexing BAM File function..")
    bamFile = 0  ## currently not supporting bam file input

    for rr in range(0, len(sample_1)):  ## for each replicate of sample_1
        rTempFolder = s1rPath + str(rr + 1)
        bam_fn = ''
        if bamFile == 0:  ## we know the location of the bam file
            bam_fn = rTempFolder + '/Aligned.sortedByCoord.out.bam'
        else:  ## bam file is provided
            bam_fn = sample_1[rr]
        if LooseVersion(pysam.version.__samtools_version__) < LooseVersion('1.3'):
            pysam.sort(bam_fn, rTempFolder + '/aligned.sorted')  ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam.bai file
        else:
            pysam.sort(bam_fn, '-o', rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam.bai file

    for rr in range(0, len(sample_2)):  ## for each replicate of sample_2
        rTempFolder = s2rPath + str(rr + 1)
        bam_fn = ''
        if bamFile == 0:  ## we know the location of the bam file
            bam_fn = rTempFolder + '/Aligned.sortedByCoord.out.bam'
        else:  ## bam file is provided
            bam_fn = sample_2[rr]
        if LooseVersion(pysam.version.__samtools_version__) < LooseVersion('1.3'):
            pysam.sort(bam_fn, rTempFolder + '/aligned.sorted')  ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam.bai file
        else:
            pysam.sort(bam_fn, '-o', rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')  ## it will make aligned.sorted.bam.bai file
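# indexBamFile() above repeats the same version check and sort/index branch for both
# sample lists. Below is a compact sketch that factors that branch out, under the same
# assumption made above: pysam built against samtools < 1.3 exposes the old positional
# sort(<in>, <out_prefix>) signature, while newer builds take the -o style. The helper
# name is illustrative, not part of the original code, e.g.
# sort_and_index(bam_fn, rTempFolder + '/aligned.sorted') would replace each branch.
from distutils.version import LooseVersion

import pysam


def sort_and_index(bam_fn, out_prefix):
    """Coordinate-sort bam_fn to <out_prefix>.bam and index it, handling both
    the pre- and post-samtools-1.3 pysam.sort() signatures."""
    sorted_bam = out_prefix + '.bam'
    if LooseVersion(pysam.version.__samtools_version__) < LooseVersion('1.3'):
        pysam.sort(bam_fn, out_prefix)        # old API: output prefix, ".bam" appended
    else:
        pysam.sort(bam_fn, '-o', sorted_bam)  # new API: explicit -o output
    pysam.index(sorted_bam)                   # creates <out_prefix>.bam.bai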
def merge_bam(self, data_dir, project_id, final_id, run_ids=[]):
    """
    Merge together all the bams in a directory and sort to create the final
    bam ready to be filtered

    If run_ids is blank then the function looks for all bam files in the
    data_dir
    """
    out_bam_file = data_dir + project_id + '/' + final_id + '.bam'

    if len(run_ids) == 0:
        bam_files = [f for f in listdir(data_dir + project_id) if f.endswith(("sai"))]
    else:
        bam_files = [f + ".bam" for f in run_ids]

    bam_sort_files = []
    bam_merge_files = []
    for bam in bam_files:
        bam_loc = data_dir + project_id + '/' + bam
        bam_sort_files.append(bam_loc)
        bam_merge_files.append(bam_loc)

    for bam_sort_file in bam_sort_files:
        print bam_sort_file
        pysam.sort("-o", str(bam_sort_file), str(bam_sort_file))

    if len(bam_sort_files) == 1:
        pysam.sort("-o", str(out_bam_file), str(bam_sort_files[0]))
    else:
        pysam.merge(out_bam_file, *bam_merge_files)
        pysam.sort("-o", str(out_bam_file), "-T", str(out_bam_file) + ".bam_sort",
                   str(out_bam_file))

    pysam.index(str(out_bam_file))
RG_idx = 1
for chrom in sorted(panel_regions):
    for curr_area in panel_regions[chrom]:
        new_header["RG"].append({"ID": str(RG_idx), args.RG_tag: curr_area.name})
        RG_id_by_source[curr_area.name] = str(RG_idx)
        RG_idx += 1

# Parse reads
with pysam.AlignmentFile(tmp_aln, "wb", header=new_header) as FH_out:
    if args.single_mode:
        log_data = processSingleReads(FH_in, panel_regions, RG_id_by_source, args)
    else:
        log_data = processPairedReads(FH_in, panel_regions, RG_id_by_source, args)

# Sort output file
pysam.sort("-o", args.output_aln, tmp_aln)
pysam.index(args.output_aln)
os.remove(tmp_aln)

# Write summary
if args.output_summary is not None:
    if args.summary_format == "json":
        writeJSONSummary(args.output_summary, log_data)
    else:
        writeTSVSummary(args.output_summary, log_data)
log.info("End of job")
def run_bowtie2( reads_fwd, reads_rev, ref_fa, out_prefix, threads=1, max_insert=1000, sort=False, bowtie2='bowtie2', bowtie2_preset='very-sensitive-local', bowtie2_version=None, verbose=False, verbose_filehandle=sys.stdout, remove_both_unmapped=False, clean_index=True, ): ref_is_indexed = True for ext in bowtie2_index_extensions: if not os.path.exists(ref_fa + '.' + ext): ref_is_indexed = False break clean_files = [] if ref_is_indexed: if verbose: print('Bowtie2 index files found (', ref_fa, '.*.bt2) so no need to index', sep='', file=verbose_filehandle) map_index = ref_fa else: map_index = out_prefix + '.map_index' bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle) if clean_index: clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']] final_bam = out_prefix + '.bam' if sort: intermediate_bam = out_prefix + '.unsorted.bam' else: intermediate_bam = final_bam map_cmd = [ bowtie2, '--threads', str(threads), '--reorder', '--' + bowtie2_preset, '-X', str(max_insert), '-x', map_index, '-1', reads_fwd, '-2', reads_rev, ] if LooseVersion(bowtie2_version) >= LooseVersion('2.3.1'): map_cmd.append('--score-min G,1,10') # We use gawk instead of awk here as we need bitwise comparisons # and these are not available via awk on Mac OSX. if remove_both_unmapped: map_cmd.append(r''' | gawk ' !(and($2,4)) || !(and($2,8)) ' ''') tmp_sam_file = out_prefix + '.unsorted.sam' map_cmd.append(' > ' + tmp_sam_file) map_cmd = ' '.join(map_cmd) common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle) if verbose: print('Converting', tmp_sam_file, '->', intermediate_bam, file=verbose_filehandle) infile = pysam.AlignmentFile(tmp_sam_file, "r") outfile = pysam.AlignmentFile(intermediate_bam, "wb", template=infile) for x in infile: outfile.write(x) infile.close() outfile.close() os.unlink(tmp_sam_file) if sort: if verbose: print('Sorting', intermediate_bam, '->', final_bam, file=verbose_filehandle) pysam.sort('-o', final_bam, '-O', 'BAM', intermediate_bam) if verbose: print('Indexing', final_bam, file=verbose_filehandle) pysam.index(final_bam) clean_files.append(intermediate_bam) for fname in clean_files: os.unlink(fname)
        # Write to bam
        outf.write(read)

reader.close()
del ra

import gc
gc.collect()

# Remove bam file to save space
# os.remove(in_bam)

# Sort tagged file
print('Sorting bam file...')
sorted_file = prefix + '_Aligned.tagged.sorted.bam'
pysam.sort("-o", sorted_file, '--threads', str(no_cores), out_bam)

# Remove unsorted file
# os.remove(out_bam)

# Index
args = f'samtools index {sorted_file}'
with subprocess.Popen(args, shell=True) as p:
    out, err = p.communicate()

# RNA velocity
print('Computing RNA velocity...')
out_dir = sys.argv[2]  # f'{prefix}/velocity'
args = f'velocyto run -b /data/peer/chanj3/SCPC_transformation/ref/737K-august-2016.txt \
    -o {out_dir} -@ {no_cores} -v {sorted_file} /data/peer/chanj3/SCPC_transformation/ref/annotations.gtf'
with subprocess.Popen(args, shell=True) as p:
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) group = U.OptionGroup(parser, "dedup-specific options") group.add_option("--output-stats", dest="stats", type="string", default=False, help="Specify location to output stats") parser.add_option_group(group) # add common options (-h/--help, ...) and parse command line (options, args) = U.Start(parser, argv=argv) U.validateSamOptions(options, group=False) if options.random_seed: np.random.seed(options.random_seed) if options.stdin != sys.stdin: in_name = options.stdin.name options.stdin.close() else: raise ValueError("Input on standard in not currently supported") if options.stdout != sys.stdout: if options.no_sort_output: out_name = options.stdout.name else: out_name = U.getTempFilename(dir=options.tmpdir) sorted_out_name = options.stdout.name options.stdout.close() else: if options.no_sort_output: out_name = "-" else: out_name = U.getTempFilename(dir=options.tmpdir) sorted_out_name = "-" if not options.no_sort_output: # need to determine the output format for sort if options.out_sam: sort_format = "sam" else: sort_format = "bam" if options.in_sam: in_mode = "r" else: in_mode = "rb" if options.out_sam: out_mode = "wh" else: out_mode = "wb" if options.stats and options.ignore_umi: raise ValueError("'--output-stats' and '--ignore-umi' options" " cannot be used together") infile = pysam.Samfile(in_name, in_mode) outfile = pysam.Samfile(out_name, out_mode, template=infile) if options.paired: outfile = sam_methods.TwoPassPairWriter(infile, outfile) nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0 if options.detection_method: bam_features = detect_bam_features(infile.filename) if not bam_features[options.detection_method]: if sum(bam_features.values()) == 0: raise ValueError( "There are no bam tags available to detect multimapping. " "Do not set --multimapping-detection-method") else: raise ValueError( "The chosen method of detection for multimapping (%s) " "will not work with this bam. 
Multimapping can be detected" " for this bam using any of the following: %s" % (options.detection_method, ",".join( [x for x in bam_features if bam_features[x]]))) gene_tag = options.gene_tag metacontig2contig = None if options.chrom: inreads = infile.fetch(reference=options.chrom) else: if options.per_contig and options.gene_transcript_map: metacontig2contig = sam_methods.getMetaContig2contig( infile, options.gene_transcript_map) metatag = "MC" inreads = sam_methods.metafetcher(infile, metacontig2contig, metatag) gene_tag = metatag else: inreads = infile.fetch() # set up ReadCluster functor with methods specific to # specified options.method processor = network.ReadDeduplicator(options.method) bundle_iterator = sam_methods.get_bundles( options, metacontig_contig=metacontig2contig) if options.stats: # set up arrays to hold stats data stats_pre_df_dict = {"UMI": [], "counts": []} stats_post_df_dict = {"UMI": [], "counts": []} pre_cluster_stats = [] post_cluster_stats = [] pre_cluster_stats_null = [] post_cluster_stats_null = [] topology_counts = collections.Counter() node_counts = collections.Counter() read_gn = umi_methods.random_read_generator( infile.filename, chrom=options.chrom, barcode_getter=bundle_iterator.barcode_getter) for bundle, key, status in bundle_iterator(inreads): nInput += sum([bundle[umi]["count"] for umi in bundle]) while nOutput >= output_reads + 100000: output_reads += 100000 U.info("Written out %i reads" % output_reads) while nInput >= input_reads + 1000000: input_reads += 1000000 U.info("Parsed %i input reads" % input_reads) if options.stats: # generate pre-dudep stats average_distance = umi_methods.get_average_umi_distance( bundle.keys()) pre_cluster_stats.append(average_distance) cluster_size = len(bundle) random_umis = read_gn.getUmis(cluster_size) average_distance_null = umi_methods.get_average_umi_distance( random_umis) pre_cluster_stats_null.append(average_distance_null) if options.ignore_umi: for umi in bundle: nOutput += 1 outfile.write(bundle[umi]["read"]) else: # dedup using umis and write out deduped bam reads, umis, umi_counts = processor(bundle=bundle, threshold=options.threshold) for read in reads: outfile.write(read) nOutput += 1 if options.stats: # collect pre-dudupe stats stats_pre_df_dict['UMI'].extend(bundle) stats_pre_df_dict['counts'].extend( [bundle[UMI]['count'] for UMI in bundle]) # collect post-dudupe stats post_cluster_umis = [ bundle_iterator.barcode_getter(x)[0] for x in reads ] stats_post_df_dict['UMI'].extend(umis) stats_post_df_dict['counts'].extend(umi_counts) average_distance = umi_methods.get_average_umi_distance( post_cluster_umis) post_cluster_stats.append(average_distance) cluster_size = len(post_cluster_umis) random_umis = read_gn.getUmis(cluster_size) average_distance_null = umi_methods.get_average_umi_distance( random_umis) post_cluster_stats_null.append(average_distance_null) outfile.close() if not options.no_sort_output: # sort the output pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name) os.unlink(out_name) # delete the tempfile if options.stats: # generate the stats dataframe stats_pre_df = pd.DataFrame(stats_pre_df_dict) stats_post_df = pd.DataFrame(stats_post_df_dict) # tally the counts per umi per position pre_counts = collections.Counter(stats_pre_df["counts"]) post_counts = collections.Counter(stats_post_df["counts"]) counts_index = list( set(pre_counts.keys()).union(set(post_counts.keys()))) counts_index.sort() with U.openFile(options.stats + "_per_umi_per_position.tsv", "w") as outf: 
outf.write("counts\tinstances_pre\tinstances_post\n") for count in counts_index: values = (count, pre_counts[count], post_counts[count]) outf.write("\t".join(map(str, values)) + "\n") # aggregate stats pre/post per UMI agg_pre_df = aggregateStatsDF(stats_pre_df) agg_post_df = aggregateStatsDF(stats_post_df) agg_df = pd.merge(agg_pre_df, agg_post_df, how='left', left_index=True, right_index=True, sort=True, suffixes=["_pre", "_post"]) # TS - if count value not observed either pre/post-dedup, # merge will leave an empty cell and the column will be cast as a float # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html # --> Missing data casting rules and indexing # so, back fill with zeros and convert back to int agg_df = agg_df.fillna(0).astype(int) agg_df.index = [x.decode() for x in agg_df.index] agg_df.index.name = 'UMI' agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t") # bin distances into integer bins max_ed = int( max( map(max, [ pre_cluster_stats, post_cluster_stats, pre_cluster_stats_null, post_cluster_stats_null ]))) cluster_bins = range(-1, int(max_ed) + 2) def bin_clusters(cluster_list, bins=cluster_bins): ''' take list of floats and return bins''' return np.digitize(cluster_list, bins, right=True) def tallyCounts(binned_cluster, max_edit_distance): ''' tally counts per bin ''' return np.bincount(binned_cluster, minlength=max_edit_distance + 3) pre_cluster_binned = bin_clusters(pre_cluster_stats) post_cluster_binned = bin_clusters(post_cluster_stats) pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null) post_cluster_null_binned = bin_clusters(post_cluster_stats_null) edit_distance_df = pd.DataFrame( { "unique": tallyCounts(pre_cluster_binned, max_ed), "unique_null": tallyCounts(pre_cluster_null_binned, max_ed), options.method: tallyCounts(post_cluster_binned, max_ed), "%s_null" % options.method: tallyCounts(post_cluster_null_binned, max_ed), "edit_distance": cluster_bins }, columns=[ "unique", "unique_null", options.method, "%s_null" % options.method, "edit_distance" ]) # TS - set lowest bin (-1) to "Single_UMI" edit_distance_df['edit_distance'][0] = "Single_UMI" edit_distance_df.to_csv(options.stats + "_edit_distance.tsv", index=False, sep="\t") # write footer and output benchmark information. U.info("Reads: %s" % ", ".join([ "%s: %s" % (x[0], x[1]) for x in bundle_iterator.read_events.most_common() ])) U.info("Number of reads out: %i" % nOutput) if not options.ignore_umi: # otherwise processor has not been used U.info("Total number of positions deduplicated: %i" % processor.UMIClusterer.positions) if processor.UMIClusterer.positions > 0: U.info("Mean number of unique UMIs per position: %.2f" % (float(processor.UMIClusterer.total_umis_per_position) / processor.UMIClusterer.positions)) U.info("Max. number of unique UMIs per position: %i" % processor.UMIClusterer.max_umis_per_position) else: U.warn("The BAM did not contain any valid " "reads/read pairs for deduplication") U.Stop()
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True, maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3): '''Call stampy for actual mapping''' if VERBOSE: print 'Premapping: adaID ', adaID if summary: summary_filename = get_premap_summary_filename(data_folder, adaID) # Stampy can handle both gzipped and uncompressed fastq inputs input_filenames = get_read_filenames(data_folder, adaID, gzip=True) if not os.path.isfile(input_filenames[0]): input_filenames = get_read_filenames(data_folder, adaID, gzip=False) if not all(map(os.path.isfile, input_filenames)): raise OSError('Input files for mapping not found: ' + input_filenames[0]) # parallelize if requested if threads == 1: call_list = [ stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False), '-o', get_premapped_filename(data_folder, adaID, type='sam'), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), ] if maxreads > 0: call_list.append('--numrecords=' + str(maxreads)) call_list.extend(['-M'] + input_filenames) call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) sp.call(call_list) if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\nStampy premapped (single thread).\n') # Convert to compressed BAM convert_sam_to_bam( get_premapped_filename(data_folder, adaID, type='bam')) if summary: with open(summary_filename, 'a') as f: f.write('\nSAM file converted to compressed BAM: '+\ get_premapped_filename(data_folder, adaID, type='bam')+'\n') else: # Multithreading works as follows: call qsub + stampy, monitor the process # IDs with qstat at regular intervals, and finally merge results with pysam output_file_parts = [ get_premapped_filename(data_folder, adaID, type='bam', part=(j + 1)) for j in xrange(threads) ] # Submit map script jobs_done = np.zeros(threads, bool) job_IDs = np.zeros(threads, 'S30') # Submit map call import hivwholeseq JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/' JOBLOGOUT = JOBDIR + 'logout' JOBLOGERR = JOBDIR + 'logerr' cluster_time = ['23:59:59', '1:59:59'] vmem = '8G' for j in xrange(threads): call_list = [ 'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT, '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l', 'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem, stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename( data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename( data_folder, adaID, ext=False), '-o', get_premapped_filename( data_folder, adaID, type='sam', part=(j + 1)), '--processpart=' + str(j + 1) + '/' + str(threads), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), '-M' ] + input_filenames call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) job_ID = sp.check_output(call_list) job_ID = job_ID.split()[2] job_IDs[j] = job_ID # Monitor output time_wait = 10 # secs while not jobs_done.all(): # Sleep some time time.sleep(time_wait) # Get the output of qstat to check the status of jobs qstat_output = sp.check_output(['qstat']) qstat_output = qstat_output.split( '\n')[:-1] # The last is an empty line if VERBOSE >= 3: print qstat_output if len(qstat_output) < 3: jobs_done[:] = True break else: qstat_output = [line.split()[0] for line in qstat_output[2:]] time_wait = 
10 # secs for j in xrange(threads): if jobs_done[j]: continue if job_IDs[j] not in qstat_output: # Convert to BAM for merging if VERBOSE >= 1: print 'Convert premapped reads to BAM for merging: adaID '+\ adaID+', part '+str(j+1)+ ' of '+ \ str(threads) convert_sam_to_bam(output_file_parts[j]) # We do not need to wait if we did the conversion (it takes # longer than some secs) time_wait = 0 jobs_done[j] = True if summary: with open(summary_filename, 'a') as f: f.write('Stampy premapped (' + str(threads) + ' threads).\n') # Concatenate output files if VERBOSE >= 1: print 'Concatenate premapped reads: adaID ' + adaID + '...', output_filename = get_premapped_filename(data_folder, adaID, type='bam', unsorted=True) pysam.cat('-o', output_filename, *output_file_parts) if VERBOSE >= 1: print 'done.' if summary: with open(summary_filename, 'a') as f: f.write('BAM files concatenated (unsorted).\n') # Sort the file by read names (to ensure the pair_generator) # NOTE: we exclude the extension and the option -f because of a bug in samtools if VERBOSE >= 1: print 'Sort premapped reads: adaID ' + adaID output_filename_sorted = get_premapped_filename(data_folder, adaID, type='bam', unsorted=False) pysam.sort('-n', output_filename, output_filename_sorted[:-4]) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file sorted.\n') # Reheader the file without BAM -> SAM -> BAM if VERBOSE >= 1: print 'Reheader premapped reads: adaID ' + adaID header_filename = get_premapped_filename(data_folder, adaID, type='sam', part=1) pysam.reheader(header_filename, output_filename_sorted) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file reheaded.\n') if VERBOSE >= 1: print 'Remove temporary files: adaID ' + adaID remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE) if summary: with open(summary_filename, 'a') as f: f.write('Temp premapping files removed.\n') f.write('\n')
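# A minimal sketch (not part of the original premap_stampy code) of the merge step the
# multithreaded branch above performs: concatenate the per-thread BAM parts with
# pysam.cat, then sort the joint file by read name so mates sit next to each other.
# It assumes pysam >= 0.9 with the samtools-style "-o" output option (the sort call above
# uses the older prefix convention); all file names are hypothetical.
import pysam

def merge_and_namesort_parts(part_bams, merged_bam, namesorted_bam):
    """Concatenate BAM parts sharing a header, then sort the result by read name."""
    # samtools cat requires compatible headers, which holds for parts produced by the
    # same stampy run against the same reference.
    pysam.cat("-o", merged_bam, *part_bams)
    # Name sorting keeps read pairs adjacent for downstream pair iteration; note that a
    # name-sorted BAM cannot be indexed with pysam.index (only coordinate-sorted files can).
    pysam.sort("-n", "-o", namesorted_bam, merged_bam)

# merge_and_namesort_parts(["premapped_part1.bam", "premapped_part2.bam"],
#                          "premapped_unsorted.bam", "premapped.bam")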
def run_chimerascan(runconfig): """ main function for running the chimerascan pipeline """ # print a welcome message title_string = "Running chimerascan version %s" % (__version__) logging.info(title_string) logging.info("-" * len(title_string)) # validate run configuration config_passed = runconfig.check_config() if not config_passed: logging.error("Invalid run configuration, aborting.") return config.JOB_ERROR # create output dir if it does not exist if not os.path.exists(runconfig.output_dir): os.makedirs(runconfig.output_dir) logging.info("Created output directory: %s" % (runconfig.output_dir)) # create log dir if it does not exist log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR) if not os.path.exists(log_dir): os.makedirs(log_dir) logging.debug("Created directory for log files: %s" % (log_dir)) # create tmp dir if it does not exist tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) logging.debug("Created directory for tmp files: %s" % (tmp_dir)) # write the run config to a file xmlstring = runconfig.to_xml() runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE) logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file)) fh = open(runconfig_xml_file, "w") print >> fh, xmlstring fh.close() # mask biotypes and references mask_biotypes = set() if runconfig.mask_biotypes_file: logging.info("Reading biotypes mask file") mask_biotypes.update( [line.strip() for line in open(runconfig.mask_biotypes_file)]) logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes)))) mask_rnames = set() if runconfig.mask_rnames_file: logging.info("Reading references mask file") mask_rnames.update( [line.strip() for line in open(runconfig.mask_rnames_file)]) logging.info("\tread references: %s" % (','.join(sorted(mask_rnames)))) # read transcripts logging.info("Reading transcript features") transcript_file = os.path.join(runconfig.index_dir, config.TRANSCRIPT_FEATURE_FILE) transcripts = list(TranscriptFeature.parse(open(transcript_file))) logging.info("\tread %d transcripts" % (len(transcripts))) # setup alignment indexes genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX) transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX) max_transcriptome_hits_file = os.path.join(runconfig.index_dir, config.MAX_MULTIMAPPING_FILE) max_transcriptome_hits = int( open(max_transcriptome_hits_file).next().strip()) # detect read length original_read_length = detect_read_length(runconfig.fastq_files[0]) # minimum fragment length cannot be smaller than the trimmed read length trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3) min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length) # # Process and inspect the FASTQ files, performing several alterations # to the reads: # # 1) rename them from long string to numbers to save space throughout # the pipeline. 
also store mapping from read numbers to full names # in a separate file # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads # 3) convert quality scores to sanger format # converted_fastq_files = [ os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES ] read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE) msg = "Processing FASTQ files" skip = all( up_to_date(cfq, fq) for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files)) skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0]) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) converted_fastq_prefix = \ os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX) try: retcode = process_input_reads(runconfig.fastq_files, converted_fastq_prefix, quals=runconfig.quals, trim5=runconfig.trim5, trim3=runconfig.trim3) if retcode != config.JOB_SUCCESS: logging.error("%s step failed" % (msg)) return config.JOB_ERROR except Exception as e: logging.info("Cleaning up after error %s" % (str(e))) for fq in converted_fastq_files: if os.path.isfile(fq): os.remove(fq) # # Transcriptome alignment step # # Align to transcriptome in paired-end mode, trying to resolve as many # reads as possible. # transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE) transcriptome_unaligned_path = os.path.join( tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH) transcriptome_unaligned_fastq_files = tuple( os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES) msg = "Aligning paired-end reads to transcriptome" if (all( up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and all( up_to_date(a, b) for a, b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE) retcode = bowtie2_align_transcriptome_pe( transcriptome_index=transcriptome_index, genome_index=genome_index, transcript_file=transcript_file, fastq_files=converted_fastq_files, unaligned_path=transcriptome_unaligned_path, bam_file=transcriptome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_transcriptome_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(transcriptome_bam_file): os.remove(transcriptome_bam_file) for f in transcriptome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Sort transcriptome reads by position # msg = "Sorting transcriptome reads" sorted_transcriptome_bam_file = os.path.join( runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE) if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) sorted_aligned_bam_prefix = os.path.splitext( sorted_transcriptome_bam_file)[0] pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix) # # Index BAM file # msg = "Indexing BAM file" sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai" if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_transcriptome_bam_file) # # Get insert size distribution # isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE) msg 
= "Profiling insert size distribution" if up_to_date(isize_dist_file, transcriptome_bam_file): logging.info("[SKIPPED] %s" % msg) isize_dist = InsertSizeDistribution.from_file( open(isize_dist_file, "r")) else: logging.info(msg) bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb") isize_dist = InsertSizeDistribution.from_genome_bam( bamfh, transcripts, min_isize=min_fragment_length, max_isize=runconfig.max_fragment_length, max_samples=config.ISIZE_MAX_SAMPLES) bamfh.close() # if not enough samples, use a normal distribution instead # of the empirical distribution if isize_dist.n < config.ISIZE_MIN_SAMPLES: logging.warning("Not enough fragments to sample insert size " "distribution empirically. Using mean=%d " "stdev=%f instead" % (runconfig.isize_mean, runconfig.isize_stdev)) isize_dist = InsertSizeDistribution.from_random( runconfig.isize_mean, runconfig.isize_stdev, min_isize=runconfig.min_fragment_length, max_isize=runconfig.max_fragment_length, samples=config.ISIZE_MAX_SAMPLES) isize_dist.to_file(open(isize_dist_file, "w")) # # Determine ideal segment length automatically # # log insert size statistics logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % (isize_dist.n, isize_dist.mean(), isize_dist.std(), isize_dist.isize_at_percentile(50.0), isize_dist.mode())) # choose a segment length to optimize mapping optimal_isize = isize_dist.isize_at_percentile( DEFAULT_FRAG_SIZE_SENSITIVITY) logging.info("Determining soft-clipped segment length") logging.debug("\tInsert size at %f percent of distribution is %d" % (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize)) optimal_segment_length = int(round(optimal_isize / 3.0)) logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length)) segment_length = min(optimal_segment_length, trimmed_read_length) segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length) logging.debug( "\tAfter adjusting for min %d and read length %d, final segment length is %d" % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length)) if runconfig.segment_length is not None: logging.debug( "\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length)) segment_length = runconfig.segment_length # # Genome alignment step # # Align any unaligned transcriptome reads to genome in paired-end mode. # Resolve as many reads as possible. 
# genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE) genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH) genome_unaligned_fastq_files = tuple( os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES) msg = "Realigning unaligned paired-end reads to genome" if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and all( up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files, converted_fastq_files))): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) log_file = os.path.join(log_dir, config.GENOME_LOG_FILE) retcode = bowtie2_align_pe( index=genome_index, fastq_files=transcriptome_unaligned_fastq_files, unaligned_path=genome_unaligned_path, bam_file=genome_bam_file, log_file=log_file, library_type=runconfig.library_type, min_fragment_length=min_fragment_length, max_fragment_length=runconfig.max_fragment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) # cleanup if job failed if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(genome_bam_file): os.remove(genome_bam_file) for f in genome_unaligned_fastq_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Realignment step # # trim and realign all the initially unaligned reads in order to # increase sensitivity to detect reads spanning fusion junctions # realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE) realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE) msg = "Trimming and realigning initially unmapped reads" if (all( up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and up_to_date(realigned_bam_file, isize_dist_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = bowtie2_align_pe_sr(index=transcriptome_index, transcript_file=transcript_file, fastq_files=genome_unaligned_fastq_files, bam_file=realigned_bam_file, log_file=realigned_log_file, tmp_dir=tmp_dir, segment_length=segment_length, max_hits=max_transcriptome_hits, num_processors=runconfig.num_processors) if retcode != config.JOB_SUCCESS: if os.path.exists(realigned_bam_file): os.remove(realigned_bam_file) return config.JOB_ERROR # # Find discordant reads # # iterate through realigned reads and divide them into groups of # concordant, discordant within a gene (isoforms), discordant # between different genes, and discordant in the genome # paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE) discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE) unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE) unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE) multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE) unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE) output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file, unmapped_bam_file, multimap_bam_file, unresolved_bam_file) msg = "Classifying concordant and discordant read pairs" if (all(up_to_date(f, realigned_bam_file) for f in output_files)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = find_discordant_fragments( transcripts=transcripts, input_bam_file=realigned_bam_file, paired_bam_file=paired_bam_file, discordant_bam_file=discordant_bam_file, unpaired_bam_file=unpaired_bam_file, unmapped_bam_file=unmapped_bam_file, multimap_bam_file=multimap_bam_file, unresolved_bam_file=unresolved_bam_file, max_isize=runconfig.max_fragment_length, 
max_multihits=runconfig.max_multihits, library_type=runconfig.library_type) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) return config.JOB_ERROR # # Convert discordant transcriptome reads to genome coordinates # discordant_genome_bam_file = os.path.join( tmp_dir, config.DISCORDANT_GENOME_BAM_FILE) msg = "Converting discordant transcriptome hits to genomic coordinates" if (up_to_date(discordant_genome_bam_file, discordant_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) discordant_genome_sam_file = os.path.join( tmp_dir, config.DISCORDANT_GENOME_SAM_FILE) retcode = transcriptome_to_genome( genome_index, transcripts, input_file=discordant_bam_file, output_file=discordant_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(discordant_genome_bam_file): os.remove(discordant_genome_bam_file) return config.JOB_ERROR if os.path.exists(discordant_genome_sam_file): os.remove(discordant_genome_sam_file) # # Sort discordant reads by position # msg = "Sorting discordant BAM file" sorted_discordant_genome_bam_file = os.path.join( tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE) if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing discordant BAM file" sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai" if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_discordant_genome_bam_file) # # Convert unpaired transcriptome reads to genome coordinates # unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE) msg = "Converting unpaired transcriptome hits to genomic coordinates" if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) unpaired_genome_sam_file = os.path.join( tmp_dir, config.UNPAIRED_GENOME_SAM_FILE) retcode = transcriptome_to_genome(genome_index, transcripts, input_file=unpaired_bam_file, output_file=unpaired_genome_sam_file, library_type=runconfig.library_type, input_sam=False, output_sam=True) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) return config.JOB_ERROR retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unpaired_genome_bam_file): os.remove(unpaired_genome_bam_file) return config.JOB_ERROR if os.path.exists(unpaired_genome_sam_file): os.remove(unpaired_genome_sam_file) # # Sort unpaired reads by position # msg = "Sorting unpaired BAM file" sorted_unpaired_genome_bam_file = os.path.join( tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE) if (up_to_date(sorted_unpaired_genome_bam_file, 
unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0] pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing unpaired BAM file" sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai" if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_unpaired_genome_bam_file) # # Cluster discordant reads into chimera candidates # cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE) cluster_shelve_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE) sorted_discordant_genome_cluster_bam_file = \ os.path.join(runconfig.output_dir, config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE) input_files = (sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file) output_files = (cluster_file, cluster_shelve_file, sorted_discordant_genome_cluster_bam_file) msg = "Clustering discordant reads" skip = True for input_file in input_files: for output_file in output_files: skip = skip and up_to_date(output_file, input_file) if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = cluster_discordant_reads( discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, concordant_bam_file=sorted_transcriptome_bam_file, output_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_file=cluster_file, cluster_shelve_file=cluster_shelve_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Pair discordant clusters # cluster_pair_file = \ os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE) msg = "Pairing discordant clusters" output_files = (cluster_pair_file, ) if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file): logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = pair_discordant_clusters( discordant_bam_file=sorted_discordant_genome_cluster_bam_file, cluster_pair_file=cluster_pair_file, tmp_dir=tmp_dir) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Perform realignment across putative fusion breakpoints # breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE) msg = "Realigning to find breakpoint-spanning reads" input_files = (sorted_discordant_genome_bam_file, sorted_unpaired_genome_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (breakpoint_bam_file, ) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.debug(msg) retcode = realign_across_breakpoints( index_dir=runconfig.index_dir, discordant_bam_file=sorted_discordant_genome_bam_file, unpaired_bam_file=sorted_unpaired_genome_bam_file, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, breakpoint_bam_file=breakpoint_bam_file, log_dir=log_dir, tmp_dir=tmp_dir, num_processors=runconfig.num_processors, local_anchor_length=runconfig.local_anchor_length, local_multihits=runconfig.local_multihits) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) # # Nominate breakpoint spanning reads (split reads) # spanning_sam_file = 
os.path.join(tmp_dir, config.SPANNING_SAM_FILE) spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE) spanning_cluster_pair_file = os.path.join( tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE) msg = "Processing breakpoint-spanning alignments" input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file) output_files = (spanning_bam_file, spanning_cluster_pair_file) skip = True for inp in input_files: for outp in output_files: if not up_to_date(outp, inp): skip = False if skip: logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = process_spanning_alignments( cluster_shelve_file=cluster_shelve_file, cluster_pair_file=cluster_pair_file, bam_file=breakpoint_bam_file, output_sam_file=spanning_sam_file, output_cluster_pair_file=spanning_cluster_pair_file, local_anchor_length=runconfig.local_anchor_length) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) for f in output_files: if os.path.exists(f): os.remove(f) retcode = sam_to_bam(spanning_sam_file, spanning_bam_file) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(spanning_bam_file): os.remove(spanning_bam_file) return config.JOB_ERROR if os.path.exists(spanning_sam_file): os.remove(spanning_sam_file) # # Sort unpaired reads by position # msg = "Sorting spanning BAM file" sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE) if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0] pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix) # # Index BAM file # msg = "Indexing spanning BAM file" sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai" if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) pysam.index(sorted_spanning_bam_file) # # Write chimera file # unfiltered_chimera_bedpe_file = os.path.join( runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE) msg = "Writing unfiltered chimeras to file %s" % ( unfiltered_chimera_bedpe_file) if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = write_output(transcripts, cluster_shelve_file=cluster_shelve_file, cluster_pair_file=spanning_cluster_pair_file, read_name_file=read_name_file, output_file=unfiltered_chimera_bedpe_file, annotation_source="ensembl") if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(unfiltered_chimera_bedpe_file): os.remove(unfiltered_chimera_bedpe_file) # # Filter chimeras # chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE) msg = "Filtering chimeras" if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)): logging.info("[SKIPPED] %s" % (msg)) else: logging.info(msg) retcode = filter_chimeras( input_file=unfiltered_chimera_bedpe_file, output_file=chimera_bedpe_file, filter_num_frags=runconfig.filter_num_frags, filter_allele_fraction=runconfig.filter_allele_fraction, mask_biotypes=mask_biotypes, mask_rnames=mask_rnames) if retcode != config.JOB_SUCCESS: logging.error("[FAILED] %s" % (msg)) if os.path.exists(chimera_bedpe_file): os.remove(chimera_bedpe_file) # # Cleanup # if not runconfig.keep_tmp: logging.info("Cleaning up temporary files") shutil.rmtree(tmp_dir) # # 
Done # logging.info("Finished run.") return config.JOB_SUCCESS
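# A small helper sketch (not chimerascan code) for the pattern run_chimerascan repeats
# several times above: coordinate-sort a BAM, index it, and skip both steps when the
# outputs are already newer than the inputs. The staleness test here is a plain mtime
# comparison standing in for the pipeline's own up_to_date() helper, and the modern
# "-o" form of pysam.sort is assumed (the calls above use the prefix form).
import os
import pysam

def sort_and_index_if_stale(in_bam, sorted_bam, mem="768M"):
    """Coordinate-sort and index in_bam unless sorted_bam and its .bai are already current."""
    def newer(product, source):
        return os.path.exists(product) and os.path.getmtime(product) >= os.path.getmtime(source)

    if not newer(sorted_bam, in_bam):
        # "-m" caps memory per sorting thread; samtools accepts suffixes such as 768M or 1G.
        pysam.sort("-m", mem, "-o", sorted_bam, in_bam)
    if not newer(sorted_bam + ".bai", sorted_bam):
        pysam.index(sorted_bam)
    return sorted_bam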
# os.remove("hisat2.G2A.sam") unique_num = C2T_num + G2A_num total_reads = unique_num + multimapper_num + unmapped_num sys.stderr.write("[%s]Completed successfully:\n" % strftime("%Y-%m-%d %H:%M:%S", time.localtime())) sys.stderr.write(" Total reads: %d\n" % total_reads) sys.stderr.write(" Unique mapping: %d (%.3f%%)\n" % (unique_num, 100 * unique_num / (total_reads + 0.0))) sys.stderr.write(" C2T: %d (%.2f%%)\n" % (C2T_num, 100 * C2T_num / (total_reads + 0.0))) sys.stderr.write(" G2A: %d (%.2f%%)\n" % (G2A_num, 100 * G2A_num / (total_reads + 0.0))) sys.stderr.write(" Multiple mapping: %d (%.3f%%)\n" % (multimapper_num, 100 * multimapper_num / (total_reads + 0.0))) sys.stderr.write(" Unmapped: %d (%.3f%%)\n" % (unmapped_num, 100 * unmapped_num / (total_reads + 0.0))) if options.sorted_bam == True: sys.stderr.write("[%s]Sorting bam...\n" % strftime("%Y-%m-%d %H:%M:%S", time.localtime())) pysam.sort("-o", options.output + ".sorted.bam", options.output + ".bam") if options.no_sorted_bam_index == False: sys.stderr.write("[%s]Indexing bam...\n" % strftime("%Y-%m-%d %H:%M:%S", time.localtime())) pysam.index(options.output + ".sorted.bam") if options.del_bam == True: os.remove(options.output + ".bam")
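# A hedged cross-check sketch for the mapping statistics printed above: derive comparable
# unique / multi-mapped / unmapped counts directly from the final BAM using pysam flags
# and the NH tag. Exact agreement depends on how the upstream aligner sets NH and the
# secondary flag, so treat the numbers as illustrative rather than authoritative.
import pysam

def count_mapping_classes(bam_path):
    """Return (unique, multi, unmapped) counts over primary records in bam_path."""
    unique = multi = unmapped = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam.fetch(until_eof=True):
            if read.is_secondary or read.is_supplementary:
                continue  # count each read once, via its primary record
            if read.is_unmapped:
                unmapped += 1
            elif read.has_tag("NH") and read.get_tag("NH") > 1:
                multi += 1
            else:
                unique += 1
    return unique, multi, unmapped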
def main(args): numhum = nummou = numamb = 0 #starttime = time.clock() # parse inputs humanfilename = args.A mousefilename = args.B samplenameprefix = args.prefix outputdir = args.output_dir intermdir = args.intermediate_dir disablesort = args.no_sort disambalgo = args.aligner supportedalgorithms = set(['tophat', 'bwa', 'star']) # check existence of input BAM files if not (file_exists(humanfilename) and file_exists(mousefilename)): sys.stderr.write("\nERROR in disambiguate.py: Two existing input BAM files " "must be specified as positional arguments\n") sys.exit(2) if len(samplenameprefix) < 1: humanprefix = path.basename(humanfilename.replace(".bam","")) mouseprefix = path.basename(mousefilename.replace(".bam","")) else: if samplenameprefix.endswith(".bam"): samplenameprefix = samplenameprefix[0:samplenameprefix.rfind(".bam")] # the above if is not stricly necessary for this to work humanprefix = samplenameprefix mouseprefix = samplenameprefix samplenameprefix = None # clear variable if disambalgo.lower() not in supportedalgorithms: print(disambalgo+" is not a supported disambiguation scheme at the moment.") sys.exit(2) if disablesort: humanfilenamesorted = humanfilename # assumed to be sorted externally... mousefilenamesorted = mousefilename # assumed to be sorted externally... else: if not path.isdir(intermdir): makedirs(intermdir) humanfilenamesorted = path.join(intermdir,humanprefix+".speciesA.namesorted.bam") mousefilenamesorted = path.join(intermdir,mouseprefix+".speciesB.namesorted.bam") if not path.isfile(humanfilenamesorted): pysam.sort("-n","-m","2000000000",humanfilename,humanfilenamesorted.replace(".bam","")) if not path.isfile(mousefilenamesorted): pysam.sort("-n","-m","2000000000",mousefilename,mousefilenamesorted.replace(".bam","")) # read in human reads and form a dictionary myHumanFile = pysam.Samfile(humanfilenamesorted, "rb" ) myMouseFile = pysam.Samfile(mousefilenamesorted, "rb" ) if not path.isdir(outputdir): makedirs(outputdir) myHumanUniqueFile = pysam.Samfile(path.join(outputdir, humanprefix+".disambiguatedSpeciesA.bam"), "wb", template=myHumanFile) myHumanAmbiguousFile = pysam.Samfile(path.join(outputdir, humanprefix+".ambiguousSpeciesA.bam"), "wb", template=myHumanFile) myMouseUniqueFile = pysam.Samfile(path.join(outputdir, mouseprefix+".disambiguatedSpeciesB.bam"), "wb", template=myMouseFile) myMouseAmbiguousFile = pysam.Samfile(path.join(outputdir, mouseprefix+".ambiguousSpeciesB.bam"), "wb", template=myMouseFile) summaryFile = open(path.join(outputdir,humanprefix+'_summary.txt'),'w') #initialise try: nexthumread=myHumanFile.next() nextmouread=myMouseFile.next() except StopIteration: print("No reads in one or either of the input files") sys.exit(2) EOFmouse = EOFhuman = False prevHumID = '-+=RANDOMSTRING=+-' prevMouID = '-+=RANDOMSTRING=+-' while not EOFmouse&EOFhuman: while not (nat_cmp(nexthumread.qname,nextmouread.qname) == 0): # check order between current human and mouse qname (find a point where they're identical, i.e. 
in sync) while nat_cmp(nexthumread.qname,nextmouread.qname) > 0 and not EOFmouse: # mouse is "behind" human, output to mouse disambiguous myMouseUniqueFile.write(nextmouread) if not nextmouread.qname == prevMouID: nummou+=1 # increment mouse counter for unique only prevMouID = nextmouread.qname try: nextmouread=myMouseFile.next() except StopIteration: EOFmouse=True while nat_cmp(nexthumread.qname,nextmouread.qname) < 0 and not EOFhuman: # human is "behind" mouse, output to human disambiguous myHumanUniqueFile.write(nexthumread) if not nexthumread.qname == prevHumID: numhum+=1 # increment human counter for unique only prevHumID = nexthumread.qname try: nexthumread=myHumanFile.next() except StopIteration: EOFhuman=True if EOFhuman or EOFmouse: break # at this point the read qnames are identical and/or we've reached EOF humlist = list() moulist = list() if nat_cmp(nexthumread.qname,nextmouread.qname) == 0: humlist.append(nexthumread) nexthumread = read_next_reads(myHumanFile, humlist) # read more reads with same qname (the function modifies humlist directly) if nexthumread == None: EOFhuman = True moulist.append(nextmouread) nextmouread = read_next_reads(myMouseFile, moulist) # read more reads with same qname (the function modifies moulist directly) if nextmouread == None: EOFmouse = True # perform comparison to check mouse, human or ambiguous if len(moulist) > 0 and len(humlist) > 0: myAmbiguousness = disambiguate(humlist, moulist, disambalgo) if myAmbiguousness < 0: # mouse nummou+=1 # increment mouse counter for myRead in moulist: myMouseUniqueFile.write(myRead) elif myAmbiguousness > 0: # human numhum+=1 # increment human counter for myRead in humlist: myHumanUniqueFile.write(myRead) else: # ambiguous numamb+=1 # increment ambiguous counter for myRead in moulist: myMouseAmbiguousFile.write(myRead) for myRead in humlist: myHumanAmbiguousFile.write(myRead) if EOFhuman: #flush the rest of the mouse reads while not EOFmouse: myMouseUniqueFile.write(nextmouread) if not nextmouread.qname == prevMouID: nummou+=1 # increment mouse counter for unique only prevMouID = nextmouread.qname try: nextmouread=myMouseFile.next() except StopIteration: #print("3") EOFmouse=True if EOFmouse: #flush the rest of the human reads while not EOFhuman: myHumanUniqueFile.write(nexthumread) if not nexthumread.qname == prevHumID: numhum+=1 # increment human counter for unique only prevHumID = nexthumread.qname try: nexthumread=myHumanFile.next() except StopIteration: EOFhuman=True summaryFile.write("sample\tunique species A pairs\tunique species B pairs\tambiguous pairs\n") summaryFile.write(humanprefix+"\t"+str(numhum)+"\t"+str(nummou)+"\t"+str(numamb)+"\n") summaryFile.close() myHumanFile.close() myMouseFile.close() myHumanUniqueFile.close() myHumanAmbiguousFile.close() myMouseUniqueFile.close() myMouseAmbiguousFile.close()
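# The disambiguation walk above steps two name-sorted BAMs in lockstep, comparing qnames
# with a natural-order comparison. For the single-file case, the same "all alignments that
# share a qname" grouping can be written with itertools.groupby, since name sorting puts
# identical names on adjacent records. A sketch only, not a replacement for the two-file
# merge walk; the path in the usage comment is hypothetical.
import itertools
import pysam

def iter_qname_groups(namesorted_bam):
    """Yield (qname, alignments) tuples from a name-sorted BAM."""
    with pysam.AlignmentFile(namesorted_bam, "rb") as bam:
        for qname, group in itertools.groupby(bam.fetch(until_eof=True),
                                              key=lambda read: read.query_name):
            yield qname, list(group)

# for qname, alignments in iter_qname_groups("sample.speciesA.namesorted.bam"):
#     ...  # score this template against its counterpart from the other species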
def filter_reads(bam, positions, fasta_length, filter_cutoff=0.97, max_insert_relative=3, min_insert=50, min_mapq=2, write_data=None, write_bam=False): # read sets observed_read1s = set() observed_read2s = set() mapped_pairs = set() final_reads = set() # counters total_read_count = 0 total_read_pairs = 0 total_mapped_pairs = 0 mapped_read_lengths = 0 # storing data read_data = {} pair_mapqs = {} pair_mismatch = {} pair_inserts = {} samfile = pysam.AlignmentFile(bam) #for printing out a new bam file if write_bam: logging.info("Copying header for new bam...") samfile_out = pysam.AlignmentFile(bam.split("/")[-1].split(".")[0] + "_filtered.bam", "wb", template=samfile) reads_all = defaultdict(list) logging.info("READING BAM: " + bam.split("/")[-1]) logging.info("Using reads with >" + str(filter_cutoff) + "% PID to consensus reference.") ## STEP 1: collect paired reads and their information for gene in tqdm(positions, desc='Getting read pairs: '): for read in samfile.fetch(gene[0], gene[1], gene[2]): total_read_count += 1 #store all reads if we're going to write them back to a new bam file if write_bam: reads_all[read.query_name].append(read) ## If we've seen this read's pair before if (read.is_read2 and read.query_name in observed_read1s) or ( read.is_read1 and read.query_name in observed_read2s): #But if we haven't already seen this complete pair, then we can complete the pair and store the information #Also check that the pair is on the same scaffold if read.query_name not in mapped_pairs and gene[ 0] == read_data[read.query_name]['scaf']: total_read_pairs += 1 if read.get_reference_positions() != []: total_mapped_pairs += 1 mapped_pairs.add(read.query_name) #add to found #for calculating mean read length mapped_read_lengths += float( read_data[read.query_name]['len']) #set mismatch percentage pair_mismatch[read.query_name] = 1 - ( (float(read_data[read.query_name]['nm']) + float(read.get_tag('NM'))) / (float(read_data[read.query_name]['len']) + read.infer_query_length())) #set insert size if read.get_reference_positions()[-1] > read_data[ read.query_name]['start']: pair_inserts[ read. 
query_name] = read.get_reference_positions( )[-1] - read_data[read.query_name]['start'] else: pair_inserts[ read.query_name] = read_data[read.query_name][ 'stop'] - read.get_reference_positions()[0] #set mapq pair_mapqs[read.query_name] = read.mapping_quality if read_data[read.query_name][ 'mapq'] > read.mapping_quality: pair_mapqs[read.query_name] = read_data[ read.query_name]['mapq'] #this is the first time we see a read from this pair and don't double count elif (read.is_read1 and read.query_name not in observed_read1s) or (read.is_read2 and read.query_name not in observed_read2s): if read.get_reference_positions( ) != []: # don't use unmapped reads if read.is_read1: observed_read1s.add(read.query_name) else: observed_read2s.add(read.query_name) #record the data for this read read_data[read.query_name] = { "nm": read.get_tag('NM'), "len": read.infer_query_length(), "mapq": read.mapping_quality, "start": read.get_reference_positions()[0], 'stop': read.get_reference_positions()[-1], 'scaf': gene[0] } ## STEP 2: INSERT SIZE CUTOFF, MAPQ CUTOFF, AND MISMATCH CUTOFF mapped_read_lengths = mapped_read_lengths / total_mapped_pairs max_insert = np.median( list(pair_inserts.values()) ) * max_insert_relative #insert size should be less than max_insert_relative * median value too_short = 0.0 too_long = 0.0 good_length = 0.0 mapq_good = 0.0 filter_cutoff_good = 0.0 logging.info("Filtering reads...") for read_pair in mapped_pairs: if pair_inserts[read_pair] > min_insert: if pair_inserts[read_pair] < max_insert: good_length += 2 if pair_mapqs[read_pair] > min_mapq: mapq_good += 2 # Which set does this read go into? if pair_mismatch[read_pair] > filter_cutoff: filter_cutoff_good += 2 final_reads.add(read_pair) #write out to new bam file if option selected if write_bam: for read in reads_all[read_pair]: samfile_out.write(read) else: too_long += 2 else: too_short += 2 table = defaultdict(list) table["total reads found"].append(str(total_read_count)) table["average mapped read length"].append(str(mapped_read_lengths)) table["total fasta length"].append(str(fasta_length)) table["expected possible coverage"].append( str(float(total_read_count) * mapped_read_lengths / fasta_length)) table["total paired reads"].append(str(total_read_pairs * 2)) table["total paired reads (%)"].append( str(int(100 * total_read_pairs * 2.0 / total_read_count))) table["total same scaffold mapped paired reads"].append( str(total_mapped_pairs * 2)) table["total same scaffold mapped paired reads (%)"].append( str(int(100 * total_read_pairs * 2.0 / total_read_count))) table["median insert size"].append(str(max_insert / max_insert_relative)) table["paired reads < 50 bp apart"].append(str(too_short)) table["max insert"].append(str(max_insert)) table["paired reads > max insert apart"].append(str(too_long)) table["reads which also pass both pair insert size filters"].append( str(good_length)) table["reads which also pass both pair insert size filters (%)"].append( str(int(100 * float(good_length) / total_read_count))) table["minimum mapq threshold"].append(str(min_mapq)) table["reads which pass minimum mapq threshold"].append(str(mapq_good)) table["reads which pass minimum mapq threshold (%)"].append( str(int(100 * float(mapq_good) / total_read_count))) table['minimum PID'].append(str(filter_cutoff)) table["(final) reads which also pass read pair PID"].append( filter_cutoff_good) table["(final) reads which also pass read pair PID (%)"].append( str(int(100 * float(filter_cutoff_good) / total_read_count))) table["(final) expected 
coverage"].append( str(float(filter_cutoff_good) * mapped_read_lengths / fasta_length)) Rdb = pd.DataFrame(table) logging.debug("**READ STATSTICS**") logging.debug("total reads found: " + str(total_read_count)) logging.debug("average mapped read length: " + str(mapped_read_lengths)) logging.debug("total fasta length: " + str(fasta_length)) logging.debug( "expected possible coverage: " + str(float(total_read_count) * mapped_read_lengths / fasta_length)) logging.debug("total paired reads: " + str(total_read_pairs * 2) + " (" + str(int(100 * total_read_pairs * 2.0 / total_read_count)) + "%)") logging.debug("total same scaffold mapped paired reads: " + str(total_mapped_pairs * 2) + " (" + str(int(100 * total_mapped_pairs * 2.0 / total_read_count)) + "%)") logging.debug("") logging.debug("median insert size: " + str(max_insert / max_insert_relative)) logging.debug("paired reads < 50 bp apart: " + str(too_short)) logging.debug("paired reads > " + str(max_insert) + " apart: " + str(too_long)) logging.debug("reads which also pass both pair insert size filters: " + str(good_length) + " (" + str(int(100 * float(good_length) / total_read_count)) + "%)") logging.debug("reads which pass minimum mapq threshold of " + str(min_mapq) + ": " + str(mapq_good) + " (" + str(int(100 * float(mapq_good) / total_read_count)) + "%)") logging.debug("(final) reads which also pass read pair PID >" + str(filter_cutoff) + "%: " + str(filter_cutoff_good) + " (" + str(int(100 * float(filter_cutoff_good) / total_read_count)) + "%)") logging.debug( "(final) expected coverage: " + str(float(filter_cutoff_good) * mapped_read_lengths / fasta_length)) ## STEP 3: WRITE DATA IF NEEDED if write_data: f = open(write_data, 'w+') for read_pair in mapped_pairs: f.write(read_pair + "\t" + "\t" + str(pair_inserts[read_pair]) + "\t" + str(pair_mapqs[read_pair]) + "\t" + str(pair_mismatch[read_pair]) + "\n") f.close() ## STEP 4: WRITE NEW BAM IF NEEDED samfile.close() if write_bam: samfile_out.close() logging.info("sorting new bam") pysam.sort("-o", bam.split("/")[-1].split(".")[0] + "_filtered_sort.bam", bam.split("/")[-1].split(".")[0] + "_filtered.bam") os.system('rm ' + bam.split("/")[-1].split(".")[0] + "_filtered.bam") return final_reads, Rdb
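# The percent-identity filter above scores each pair as
#     1 - (NM_read1 + NM_read2) / (len_read1 + len_read2)
# and keeps pairs whose score exceeds filter_cutoff. A small helper expressing the same
# formula for two mated pysam reads; it assumes both reads are mapped and carry an NM tag,
# as the pairs collected by the loop above do.
def pair_percent_identity(read1, read2):
    """Fraction of matching bases across a read pair, from NM tags and query lengths."""
    mismatches = float(read1.get_tag("NM")) + float(read2.get_tag("NM"))
    read_length_sum = float(read1.infer_query_length()) + float(read2.infer_query_length())
    return 1.0 - mismatches / read_length_sum

# e.g. keep the pair when pair_percent_identity(r1, r2) > 0.97, the default filter_cutoff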
if read2 and read2.flag & 0x4: print read if read1 is None and read2 is None: print "Somehow we ended up with a double empty" assert False elif read1 is not None and read2 is not None: print "Somehow we didn't clear out the double-map" assert False elif read2 is None: read1.rnext = -1 read1.pnext = 0 read1.tlen = 0 read1.flag = read1.flag | 0x8 outfile.write(read1) elif read1 is None: read2.rnext = -1 read2.pnext = 0 read2.tlen = 0 read2.flag = read2.flag | 0x8 outfile.write(read2) else: print "How did we get here?" assert False filename = outfile.filename outfile.close() sorted_filename = filename[:filename.index('_unsorted')] print "Sorting into", sorted_filename pysam.sort('-m', "%d" % 3e9, filename, sorted_filename)
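# The branch above turns a read whose mate was dropped into a singleton by clearing the
# mate fields and OR-ing in the 0x8 (mate unmapped) flag. The same edit written with the
# current AlignedSegment attribute names, as a hedged reference (rnext/pnext/tlen above
# are the older spellings of these attributes); the raw flag arithmetic works equally well.
def mark_mate_unmapped(read):
    """Clear mate coordinates on `read` and flag its mate as unmapped."""
    read.next_reference_id = -1     # rnext = -1, i.e. RNEXT '*'
    read.next_reference_start = 0   # pnext = 0, matching the assignment above
    read.template_length = 0        # tlen = 0
    read.mate_is_unmapped = True    # sets the 0x8 bit, like flag | 0x8 above
    return read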
def main(inputs, output, bam_file, strand_specific, library, protocol, median_fragment_size, stdev_fragment_size, read_length, reference_genome, annotations, masking, aligner_reference, start_time=int(time.time()), **kwargs): """ Args: inputs (list): list of input files containing the breakpoint pairs output (str): path to the output directory bam_file (str): path the bam file strand_specific (bool): flag to indicate the input bam is using a strand specific protocol median_fragment_size (int): the median fragment size stdev_fragment_size (int): the standard deviation in fragment size read_length (int): read length reference_genome (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genome` annotations (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genes` masking (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_masking_regions` aligner_reference (:class:`~mavis.annotate.file_io.ReferenceFile`): path to the aligner reference file (e.g 2bit file for blat) """ mkdirp(output) # check the files exist early to avoid waiting for errors if protocol == PROTOCOL.TRANS: annotations.load() reference_genome.load() masking.load() validation_settings = {} validation_settings.update(DEFAULTS.items()) validation_settings.update( {k: v for k, v in kwargs.items() if k in DEFAULTS}) validation_settings = MavisNamespace(**validation_settings) raw_evidence_bam = os.path.join(output, 'raw_evidence.bam') contig_bam = os.path.join(output, 'contigs.bam') evidence_bed = os.path.join(output, 'evidence.bed') passed_output_file = os.path.join(output, PASS_FILENAME) passed_bed_file = os.path.join(output, 'validation-passed.bed') failed_output_file = os.path.join(output, 'validation-failed.tab') contig_aligner_fa = os.path.join(output, 'contigs.fa') if validation_settings.aligner == SUPPORTED_ALIGNER.BLAT: contig_aligner_output = os.path.join(output, 'contigs.blat_out.pslx') contig_aligner_log = os.path.join(output, 'contigs.blat.log') elif validation_settings.aligner == SUPPORTED_ALIGNER.BWA_MEM: contig_aligner_output = os.path.join(output, 'contigs.bwa_mem.sam') contig_aligner_log = os.path.join(output, 'contigs.bwa_mem.log') else: raise NotImplementedError('unsupported aligner', validation_settings.aligner) igv_batch_file = os.path.join(output, 'igv.batch') input_bam_cache = BamCache(bam_file, strand_specific) bpps = read_inputs( inputs, add_default={ COLUMNS.cluster_id: None, COLUMNS.stranded: False }, add={ COLUMNS.protocol: protocol, COLUMNS.library: library }, expand_strand=False, expand_orient=True, cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x}) evidence_clusters = [] for bpp in bpps: if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME: try: evidence = GenomeEvidence( bpp.break1, bpp.break2, input_bam_cache, reference_genome.content, opposing_strands=bpp.opposing_strands, stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, data=bpp.data, stdev_fragment_size=stdev_fragment_size, read_length=read_length, median_fragment_size=median_fragment_size, **dict(validation_settings.items())) evidence_clusters.append(evidence) except ValueError as err: warnings.warn( 'Dropping breakpoint pair ({}) as bad input {}'.format( str(bpp), str(err))) elif bpp.data[COLUMNS.protocol] == PROTOCOL.TRANS: try: evidence = TranscriptomeEvidence( annotations.content, bpp.break1, bpp.break2, input_bam_cache, reference_genome.content, opposing_strands=bpp.opposing_strands, 
stranded=bpp.stranded, untemplated_seq=bpp.untemplated_seq, data=bpp.data, stdev_fragment_size=stdev_fragment_size, read_length=read_length, median_fragment_size=median_fragment_size, **dict(validation_settings.items())) evidence_clusters.append(evidence) except ValueError as err: warnings.warn('Dropping ({}) as bad input {}'.format( str(bpp), str(err))) else: raise ValueError('protocol error', bpp.data[COLUMNS.protocol]) extended_masks = {} for chrom, masks in masking.content.items( ): # extend masking by read length extended_masks[chrom] = [] for mask in masks: extended_masks[chrom].append( BioInterval(chrom, mask.start - read_length, mask.end + read_length, name=mask.name)) evidence_clusters, filtered_evidence_clusters = filter_on_overlap( evidence_clusters, extended_masks) contig_sequences = {} for i, evidence in enumerate(evidence_clusters): LOG() LOG('({} of {})'.format(i + 1, len(evidence_clusters)), 'gathered evidence for:', evidence.cluster_id, '' if COLUMNS.tracking_id not in evidence.data else '(tracking_id: {})'.format(evidence.tracking_id), time_stamp=True) LOG(evidence, time_stamp=False) LOG('possible event type(s):', BreakpointPair.classify(evidence), time_stamp=False) LOG('outer window regions: {}:{}-{} {}:{}-{}'.format( evidence.break1.chr, evidence.outer_window1[0], evidence.outer_window1[1], evidence.break2.chr, evidence.outer_window2[0], evidence.outer_window2[1]), time_stamp=False) LOG('inner window regions: {}:{}-{} {}:{}-{}'.format( evidence.break1.chr, evidence.inner_window1[0], evidence.inner_window1[1], evidence.break2.chr, evidence.inner_window2[0], evidence.inner_window2[1]), time_stamp=False) evidence.load_evidence(log=LOG) LOG('flanking pairs: {};'.format(len(evidence.flanking_pairs)), 'split reads: {}, {};'.format( *[len(a) for a in evidence.split_reads]), 'half-mapped reads: {}, {};'.format( *[len(a) for a in evidence.half_mapped]), 'spanning-reads: {};'.format(len(evidence.spanning_reads)), 'compatible flanking pairs:', len(evidence.compatible_flanking_pairs), time_stamp=False) evidence.assemble_contig(log=LOG) LOG('assembled {} contigs'.format(len(evidence.contigs)), time_stamp=False) for contig in evidence.contigs: name = 'seq-{}'.format( hashlib.md5(contig.seq.encode('utf-8')).hexdigest()) LOG('>', name, '(size={}; reads={:.0f}; coverage={:.2f})'.format( len(contig.seq), contig.remap_score(), contig.remap_coverage()), time_stamp=False) LOG(contig.seq[:140], time_stamp=False) contig_sequences[name] = contig.seq LOG('will output:', contig_aligner_fa, contig_aligner_output) raw_contig_alignments = align_sequences( contig_sequences, input_bam_cache, reference_genome=reference_genome.content, aligner_fa_input_file=contig_aligner_fa, aligner_output_file=contig_aligner_output, clean_files=validation_settings.clean_aligner_files, aligner=kwargs.get('aligner', validation_settings.aligner), aligner_reference=aligner_reference.name[0], aligner_output_log=contig_aligner_log, blat_min_identity=kwargs.get('blat_min_identity', validation_settings.blat_min_identity), blat_limit_top_aln=kwargs.get('blat_limit_top_aln', validation_settings.blat_limit_top_aln), log=LOG) for evidence in evidence_clusters: select_contig_alignments(evidence, raw_contig_alignments) LOG('alignment complete', time_stamp=True) event_calls = [] total_pass = 0 write_bed_file( evidence_bed, itertools.chain.from_iterable( [e.get_bed_repesentation() for e in evidence_clusters])) validation_counts = {} for index, evidence in enumerate(evidence_clusters): LOG() LOG('({} of {}) calling events for: {} {} 
(tracking_id: {})'.format( index + 1, len(evidence_clusters), evidence.cluster_id, evidence.putative_event_types(), evidence.tracking_id), time_stamp=True) LOG('source:', evidence) calls = [] failure_comment = None try: calls = call_events(evidence) event_calls.extend(calls) except UserWarning as err: LOG('warning: error in calling events', repr(err)) failure_comment = str(err) if not calls: failure_comment = [ 'zero events were called' ] if failure_comment is None else failure_comment evidence.data[COLUMNS.filter_comment] = failure_comment filtered_evidence_clusters.append(evidence) else: total_pass += 1 LOG('called {} event(s)'.format(len(calls)), time_stamp=True) for call in calls: LOG(call) if call.call_method == CALL_METHOD.CONTIG: LOG('\t{} {} [{}] contig_alignment_score: {}, contig_alignment_mq: {} contig_alignment_rank: {}' .format(call.event_type, call.call_method, call.contig_alignment.query_name, round(call.contig_alignment.score(), 2), tuple(call.contig_alignment.mapping_quality()), tuple(call.contig_alignment.alignment_rank()))) LOG('\talignment:', call.contig_alignment.alignment_id()) elif call.contig_alignment: LOG( '\t{} {} alignment:'.format(call.event_type, call.call_method), call.contig_alignment.alignment_id()) else: LOG('\t{} {}'.format(call.event_type, call.call_method), time_stamp=False) validation_counts[call.cluster_id] = validation_counts.get( call.cluster_id, 0) + 1 call.data[COLUMNS.validation_id] = '{}-v{}'.format( call.cluster_id, validation_counts[call.cluster_id]) LOG('\tremapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]' ', flanking pairs: {}{}'.format( 0 if not call.contig else len(call.contig.input_reads), len(call.spanning_reads), len(call.break1_split_read_names()), len(call.break1_split_read_names(tgt=True)), len(call.break2_split_read_names()), len(call.break2_split_read_names(tgt=True)), len(call.linking_split_read_names()), len(call.flanking_pairs), '' if not call.has_compatible else '(' + str(len(call.compatible_flanking_pairs)) + ')')) # write the output validated clusters (split by type and contig) for i, call in enumerate(event_calls): b1_homseq = None b2_homseq = None try: b1_homseq, b2_homseq = call.breakpoint_sequence_homology( reference_genome.content) except AttributeError: pass call.data.update({ COLUMNS.break1_homologous_seq: b1_homseq, COLUMNS.break2_homologous_seq: b2_homseq, }) LOG('{} putative calls resulted in {} events with 1 or more event call'. 
format(len(evidence_clusters), total_pass), time_stamp=True) output_tabbed_file(event_calls, passed_output_file) output_tabbed_file(filtered_evidence_clusters, failed_output_file) write_bed_file( passed_bed_file, itertools.chain.from_iterable( [e.get_bed_repesentation() for e in event_calls])) if validation_settings.write_evidence_files: with pysam.AlignmentFile(contig_bam, 'wb', template=input_bam_cache.fh) as fh: LOG('writing:', contig_bam, time_stamp=True) for evidence in evidence_clusters: for contig in evidence.contigs: for aln in contig.alignments: aln.read1.cigar = _cigar.convert_for_igv( aln.read1.cigar) fh.write(aln.read1) if aln.read2: aln.read2.cigar = _cigar.convert_for_igv( aln.read2.cigar) fh.write(aln.read2) # write the evidence with pysam.AlignmentFile(raw_evidence_bam, 'wb', template=input_bam_cache.fh) as fh: LOG('writing:', raw_evidence_bam, time_stamp=True) reads = set() for evidence in evidence_clusters: reads.update(evidence.supporting_reads()) for read in reads: read.cigar = _cigar.convert_for_igv(read.cigar) fh.write(read) # now sort the contig bam sort = re.sub(r'.bam$', '.sorted.bam', contig_bam) LOG('sorting the bam file:', contig_bam, time_stamp=True) pysam.sort('-o', sort, contig_bam) contig_bam = sort LOG('indexing the sorted bam:', contig_bam) pysam.index(contig_bam) # then sort the evidence bam file sort = re.sub(r'.bam$', '.sorted.bam', raw_evidence_bam) LOG('sorting the bam file:', raw_evidence_bam, time_stamp=True) pysam.sort('-o', sort, raw_evidence_bam) raw_evidence_bam = sort LOG('indexing the sorted bam:', raw_evidence_bam) pysam.index(raw_evidence_bam) # write the igv batch file with open(igv_batch_file, 'w') as fh: LOG('writing:', igv_batch_file, time_stamp=True) fh.write('load {} name="{}"\n'.format(passed_bed_file, 'passed events')) fh.write('load {} name="{}"\n'.format(contig_bam, 'aligned contigs')) fh.write('load {} name="{}"\n'.format(evidence_bed, 'evidence windows')) fh.write('load {} name="{}"\n'.format(raw_evidence_bam, 'raw evidence')) fh.write('load {} name="{} {} input"\n'.format( bam_file, library, protocol))
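# A compact sketch of the evidence-writing pattern above: open an output BAM with
# template=<source file> so it inherits the source header, copy a selection of reads,
# then sort and index the result. The rename uses r'\.bam$' with an escaped dot; the
# original's r'.bam$' behaves the same here because the wildcard dot still matches the
# literal dot. Paths and the selection predicate are hypothetical.
import re
import pysam

def write_selected_reads(src_bam, out_bam, keep=lambda read: not read.is_unmapped):
    """Copy reads passing `keep` into out_bam (same header), then sort and index it."""
    with pysam.AlignmentFile(src_bam, "rb") as src, \
         pysam.AlignmentFile(out_bam, "wb", template=src) as out:
        for read in src.fetch(until_eof=True):
            if keep(read):
                out.write(read)
    sorted_bam = re.sub(r"\.bam$", ".sorted.bam", out_bam)
    pysam.sort("-o", sorted_bam, out_bam)
    pysam.index(sorted_bam)
    return sorted_bam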
def buildNormalizedBAM(infiles, outfile, normalize=True): '''build a normalized BAM file. Infiles are merged and duplicated reads are removed. If *normalize* is set, reads are removed such that all files will have approximately the same number of reads. Note that the duplication here is wrong as there is no sense of strandedness preserved. ''' min_reads = getMinimumMappedReads(glob.glob("*.readstats")) samfiles = [] num_reads = 0 for infile, statsfile in infiles: samfiles.append(pysam.Samfile(infile, "rb")) num_reads += getMappedReads(statsfile) threshold = float(min_reads) / num_reads E.info("%s: min reads: %i, total reads=%i, threshold=%f" % (infiles, min_reads, num_reads, threshold)) pysam_out = pysam.Samfile(outfile, "wb", template=samfiles[0]) ninput, noutput, nduplicates = 0, 0, 0 # iterate over mapped reads last_contig, last_pos = None, None for pysam_in in samfiles: for read in pysam_in.fetch(): ninput += 1 if read.rname == last_contig and read.pos == last_pos: nduplicates += 1 continue if normalize and random.random() <= threshold: pysam_out.write(read) noutput += 1 last_contig, last_pos = read.rname, read.pos pysam_in.close() pysam_out.close() logs = IOTools.openFile(outfile + ".log", "w") logs.write("# min_reads=%i, threshold= %5.2f\n" % (min_reads, threshold)) logs.write("set\tcounts\tpercent\n") logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0)) nwithout_dups = ninput - nduplicates logs.write("duplicates\t%i\t%5.2f%%\n" % (nduplicates, 100.0 * nduplicates / ninput)) logs.write("without duplicates\t%i\t%5.2f%%\n" % (nwithout_dups, 100.0 * nwithout_dups / ninput)) logs.write("target\t%i\t%5.2f%%\n" % (min_reads, 100.0 * min_reads / nwithout_dups)) logs.write("noutput\t%i\t%5.2f%%\n" % (noutput, 100.0 * noutput / nwithout_dups)) logs.close() # if more than one samfile: sort if len(samfiles) > 1: tmpfilename = P.getTempFilename(".") pysam.sort(outfile, tmpfilename) shutil.move(tmpfilename + ".bam", outfile) os.unlink(tmpfilename) pysam.index(outfile) E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0 * noutput / ninput, min_reads))
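# buildNormalizedBAM above downsamples the merged input towards the smallest library by
# writing a read only when random.random() <= min_reads / total_reads, after collapsing
# consecutive reads that start at the same (contig, position). A stripped-down sketch of
# that sampling step; as the original's own comment notes, this duplicate definition
# ignores strand.
import random
import pysam

def subsample_bam(in_bam, out_bam, keep_fraction, seed=None):
    """Write roughly keep_fraction of position-deduplicated reads from in_bam to out_bam."""
    rng = random.Random(seed)
    last = (None, None)
    with pysam.AlignmentFile(in_bam, "rb") as src, \
         pysam.AlignmentFile(out_bam, "wb", template=src) as out:
        for read in src.fetch(until_eof=True):
            if (read.reference_id, read.reference_start) == last:
                continue  # same start position as the previous read: treat as duplicate
            last = (read.reference_id, read.reference_start)
            if rng.random() <= keep_fraction:
                out.write(read)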
def picVal(opts=None): """ called with sam so no need to convert """ assert opts <> None killme = string.punctuation + string.whitespace trantab = string.maketrans(killme, '_' * len(killme)) title = opts.title.translate(trantab) tempout = os.path.join(opts.output_dir, 'rgPicardValidate.out') temptab = os.path.join(opts.output_dir, 'rgPicardValidate.xls') opts.log_file = opts.log or os.path.join(opts.output_dir, 'rgPicardValidate_%s.log' % title) # Create output folder and save our R script in there. stf = open(opts.log_file, 'w') sortedfile = None if verbose: print '# opts.ignore', opts.ignore, ' opts.sortme=', opts.sortme if opts.sortme: fd, sortedfile = tempfile.mkstemp(suffix='rgcleansam.sorted.bam') if opts.datatype == 'sam': # need to work with a bam tempbam = samToBam(opts.input, opts.outdir) pysam.sort(tempbam, sortedfile) else: # is already bam pysam.sort(opts.input, sortedfile) cl = [ 'java -Xmx', opts.maxjheap, ' -jar ', opts.jar, ' O=', tempout, ' TMP_DIR=', opts.tmp_dir ] if verbose: print '# cl so far', cl if opts.sortme: cl.append(' I=%s' % sortedfile) else: cl.append(' I=%s' % opts.input) if opts.maxoutput == '0': opts.maxoutput = '65535' cl.append(' MAX_OUTPUT=%s' % opts.maxoutput) if opts.ignore[0] <> 'None': # picard error values to ignore cl += [' IGNORE=%s' % x for x in opts.ignore if x <> 'None'] if opts.bisulphite.lower() <> 'false': cl.append(' IS_BISULFITE_SEQUENCED=true') if opts.refseq <> '': cl += [ ' R=%s' % opts.refseq, ] s1 = ' '.join(['"%s"' % x for x in cl]) s = '## rgPicardValidate.py about to Popen:\n%s\n' % s1 stf.write(s) if verbose: print s pefilename = os.path.join(opts.output_dir, 'rgPicardValidate_%s.errors' % title) picerrors = open(pefilename, 'w') process = Popen(''.join(cl), shell=True, stderr=picerrors, stdout=picerrors, cwd=opts.output_dir) return_value = process.wait() picerrors.close() pe = open(pefilename, 'r').readlines() stf.write('## got %d rows - first few =%s\n' % (len(pe), '\n'.join(pe[:5]))) if opts.dryrun <> 'dryrun': # want to run cleansam if opts.dryrun == 'sam': outformat = 'sam' newsam = opts.sam elif opts.dryrun == 'bam': outformat = 'bam' newsam = opts.bam cleanSam(insam=opts.input, newsam=newsam, picardErrors=pe, outformat=outformat, sortme=opts.sortme) stf.close() fixPicardOutputs(tempout=tempout, output_dir=opts.output_dir, log_file=opts.log_file, html_output=opts.html_output, progname=progname, cl=cl, transpose=False) if opts.sortme: os.unlink(sortedfile) if opts.datatype == 'sam': # was converted os.unlink(tempbam) # temporary
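# A note on the legacy call pattern used by picVal above: pysam.sort(in_bam, out) treats
# its second argument as an output *prefix* and appends ".bam", so passing the mkstemp
# name that already ends in ".bam" leaves the sorted data in "<name>.bam.bam" while the
# Picard command line is built with "<name>.bam". This reading is inferred from the old
# samtools sort interface and may not hold for every pysam version; the explicit "-o"
# form below avoids the prefix convention entirely.
import pysam

def sort_to_named_output(in_bam, out_bam, by_name=False):
    """Sort in_bam and write exactly out_bam, with no implicit '.bam' suffix appended."""
    args = ["-n"] if by_name else []
    pysam.sort(*(args + ["-o", out_bam, in_bam]))
    return out_bam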
def filter_bam_multihits(filename, max_hits, tmp_dir, read_tagger, omit_detail=True):
    """Pre-processing function for cleaning up the input bam file.

    Args:
        filename: path to the input bam file.
        max_hits: maximum NH value allowed for a multi-mapping read to be kept.
        tmp_dir: directory where the unique/multi bam files are written.
        read_tagger: callable that returns a tag value for a read, or -1
            for reads that should be skipped.
        omit_detail: if True, blank out read sequence and quality to save space.

    Returns:
        None. Writes sorted and indexed 'unique.sorted.bam' and
        'multi.sorted.bam' into tmp_dir.
    """
    logger.info('Filtering input bam..')
    in_bam = pysam.Samfile(filename, 'rb')

    # unique read bam
    ubam_fn = os.path.join(tmp_dir, 'unique.bam')
    sorted_ubam_fn = os.path.join(tmp_dir, 'unique.sorted.bam')
    ubam = pysam.Samfile(ubam_fn, 'wb', template=in_bam)
    unique_counter = 0

    # multi-read bam
    mbam_fn = os.path.join(tmp_dir, 'multi.bam')
    sorted_mbam_fn = os.path.join(tmp_dir, 'multi.sorted.bam')
    mbam = pysam.Samfile(mbam_fn, 'wb', template=in_bam)
    mread_set = set()

    # split unique and multi- reads and add the read tags we need
    for read in tqdm(in_bam):
        read_tag = read_tagger(read)
        # skip reads with an unassigned tag
        if read_tag == -1:
            continue
        read.tags += [('RT', read_tag)]  # add the tag
        # omit the details in read sequence and quality; recommended for
        # larger bams because this saves memory/storage
        if omit_detail:
            read.query_sequence = '*'
            read.query_qualities = [0]
        if read.is_secondary or (read.has_tag('NH') and read.opt("NH") > 1):
            try:
                if read.opt("NH") < max_hits:
                    mbam.write(read)
                    mread_set.add(read.qname)
            except KeyError:
                raise Exception('%s: missing NH tag when is_secondary=%s' %
                                (read.qname, read.is_secondary))
        else:
            ubam.write(read)
            unique_counter += 1

    in_bam.close()
    ubam.close()
    mbam.close()

    # sorting
    pysam.sort('-o', sorted_ubam_fn, ubam_fn)
    os.remove(ubam_fn)
    pysam.sort('-o', sorted_mbam_fn, mbam_fn)
    os.remove(mbam_fn)
    pysam.index(sorted_ubam_fn)
    pysam.index(sorted_mbam_fn)

    # log the statistics
    multi_counter = len(mread_set)
    logger.info(
        'Unique reads = %s; ' % unique_counter +
        'Multi reads = %s (%.2f %%)' % (
            multi_counter,
            float(multi_counter) / (multi_counter + unique_counter) * 100))
    return
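The function expects a read_tagger callable that returns -1 for reads it cannot tag. A usage sketch with a deliberately simple tagger; the tagger below and the invocation parameters are illustrative only, not the ones used by the original project.

import pysam

def center_tagger(read):
    # Illustrative tagger: tag each read with the midpoint of its
    # alignment, or -1 if it is unmapped (and should be skipped).
    if read.is_unmapped or read.reference_end is None:
        return -1
    return (read.reference_start + read.reference_end) // 2

# Hypothetical invocation; paths and max_hits are placeholders.
# filter_bam_multihits("sample.bam", max_hits=100, tmp_dir="tmp",
#                      read_tagger=center_tagger, omit_detail=True)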
    okread.is_paired = True
    okread.is_read1 = is_first
    okread.is_read2 = not is_first
    sout.write(okread)

os.remove(mapped5)
os.remove(mapped3)
os.remove(mappedUs)

# Sorting the resulting file. We do it this way because the original
# file wasn't guaranteed to be sorted. To try to merge the three files
# (split, 5' and 3'), we'd need to assume some sorting order and I'm
# not willing to do that. Sorting afterward enforces the 'samtools'
# name ordering over anything that might have been there originally.
bsorted = os.path.join(tmpdir, "sorted.bam")
pysam.sort("-o", bsorted, "-n", outbam)
os.rename(bsorted, outbam)

####################################################################################################
# Stitching two files together to reform a single BAM file.

cmd = ["-n", "-f", args.output] + allnames
pysam.merge(*cmd)

for x in allnames:
    os.remove(x)

# Mopping up.
import shutil
shutil.rmtree(tmpdir)
dumpf.close()
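The comment above argues for name-sorting each piece and then merging with -n rather than trusting any pre-existing order. A minimal sketch of that pattern with placeholder file names:

import pysam

# Enforce samtools name ordering on each piece, then merge with -n so the
# merger can rely on that ordering. File names here are placeholders.
pieces = ["split.bam", "five_prime.bam", "three_prime.bam"]
sorted_pieces = []
for p in pieces:
    out = p.replace(".bam", ".nsorted.bam")
    pysam.sort("-n", "-o", out, p)        # name sort, argv-style call
    sorted_pieces.append(out)
# -f forces overwrite of the output if it already exists
pysam.merge("-n", "-f", "merged.bam", *sorted_pieces)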
def main():
    parser = ArgumentParser()
    parser.add_argument('--input', dest='in_bam', required=True,
                        help='Path to unaligned, paired-end, bam file.')
    parser.add_argument('--taglen', dest='tag_len', type=int, default=12,
                        help='Length in bases of the duplex tag sequence.[12]')
    parser.add_argument('--spacerlen', dest='spcr_len', type=int, default=5,
                        help='Length in bases of the spacer sequence between duplex tag and '
                             'the start of target DNA. [5]')
    parser.add_argument("--tagstats", dest='tagstats', action="store_true",
                        help="output tagstats file")
    parser.add_argument('--minmem', dest='minmem', type=int, default=3,
                        help="Minimum number of reads allowed to comprise a consensus. [3]")
    parser.add_argument('--maxmem', dest='maxmem', type=int, default=200,
                        help="Maximum number of reads allowed to comprise a consensus. [200]")
    parser.add_argument('--cutoff', dest='cutoff', type=float, default=.7,
                        help="Percentage of nucleotides at a given position in a read that must "
                             "be identical in order for a consensus to be called at that "
                             "position. [0.7]")
    parser.add_argument('--Ncutoff', dest='Ncutoff', type=float, default=1,
                        help="With --filt 'n', maximum fraction of Ns allowed in a consensus [1.0]")
    parser.add_argument('--write-sscs', dest='write_sscs', action="store_true",
                        help="Print the SSCS reads to file in FASTQ format")
    parser.add_argument('--without-dcs', dest='without_dcs', action="store_true",
                        help="Don't print final DCS reads")
    parser.add_argument("--rep_filt", action="store", type=int, dest='rep_filt',
                        help="Remove tags with homomeric runs of nucleotides of length x. [9]",
                        default=9)
    parser.add_argument('--prefix', dest='prefix', type=str, required=True,
                        help="Sample name to uniquely identify samples")
    o = parser.parse_args()

    dummy_header = {'HD': {'VN': '1.0'},
                    'SQ': [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}]}
    in_bam_file = pysam.AlignmentFile(o.in_bam, "rb", check_sq=False)
    temp_bam = pysam.AlignmentFile(o.prefix + ".temp.bam", 'wb', header=dummy_header)
    paired_end_count = 1

    if o.write_sscs is True:
        read1_sscs_fq_file = gzip.open(o.prefix + '_read1_sscs.fq.gz', 'wb')
        read2_sscs_fq_file = gzip.open(o.prefix + '_read2_sscs.fq.gz', 'wb')

    if o.without_dcs is False:
        read1_dcs_fq_file = gzip.open(o.prefix + '_read1_dcs.fq.gz', 'wb')
        read2_dcs_fq_file = gzip.open(o.prefix + '_read2_dcs.fq.gz', 'wb')

    '''This block of code takes an unaligned bam file, extracts the tag
    sequences from the reads, and converts them to "ab/ba" format where
    'a' and 'b' are the tag sequences from Read 1 and Read 2, respectively.
    Conversion occurs by putting the tag with the "lesser" value in front
    of the tag with the "higher" value. The original tag orientation is
    denoted by appending #ab or #ba to the end of the tag. After
    conversion, the resulting temporary bam file is then sorted by read
    name.'''
    print "Parsing tags..."
    for line in in_bam_file.fetch(until_eof=True):
        if paired_end_count % 2 == 1:
            temp_read1_entry = pysam.AlignedSegment()
            temp_read1_entry.query_name = line.query_name
            temp_read1_entry.query_sequence = line.query_alignment_sequence
            temp_read1_entry.query_qualities = line.query_alignment_qualities

        if paired_end_count % 2 == 0:
            temp_bam_entry = pysam.AlignedSegment()

            if temp_read1_entry.query_sequence[:o.tag_len] > \
                    line.query_alignment_sequence[:o.tag_len]:
                temp_bam_entry.query_name = temp_read1_entry.query_sequence[:o.tag_len] + \
                    line.query_alignment_sequence[:o.tag_len] + '#ab'
            elif temp_read1_entry.query_sequence[:o.tag_len] < \
                    line.query_alignment_sequence[:o.tag_len]:
                temp_bam_entry.query_name = line.query_alignment_sequence[:o.tag_len] + \
                    temp_read1_entry.query_sequence[:o.tag_len] + '#ba'
            elif temp_read1_entry.query_sequence[:o.tag_len] == \
                    line.query_alignment_sequence[:o.tag_len]:
                paired_end_count += 1
                continue

            # Write entries for Read 1
            temp_bam_entry.query_name += ":1"
            temp_bam_entry.query_sequence = temp_read1_entry.query_sequence[o.tag_len + o.spcr_len:]
            temp_bam_entry.query_qualities = temp_read1_entry.query_qualities[o.tag_len + o.spcr_len:]
            temp_bam_entry.set_tag('X?', temp_read1_entry.query_name, 'Z')
            temp_bam.write(temp_bam_entry)

            # Write entries for Read 2
            temp_bam_entry.query_name = temp_bam_entry.query_name.replace('1', '2')
            temp_bam_entry.query_sequence = line.query_sequence[o.tag_len + o.spcr_len:]
            temp_bam_entry.query_qualities = line.query_qualities[o.tag_len + o.spcr_len:]
            temp_bam_entry.set_tag('X?', line.query_name, 'Z')
            temp_bam.write(temp_bam_entry)

        paired_end_count += 1

    in_bam_file.close()
    temp_bam.close()

    print "Sorting reads on tag sequence..."
    # Sort by read name, which will be the tag sequence in this case.
    pysam.sort("-n", o.prefix + ".temp.bam", "-o", o.prefix + ".temp.sort.bam")
    os.remove(o.prefix + ".temp.bam")

    '''Extracting tags and sorting based on tag sequence is complete.
    This block of code now performs the consensus calling on the tag
    families in the temporary name sorted bam file.'''
    seq_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
    qual_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
    fam_size_x_axis = []
    fam_size_y_axis = []
    read1_dcs_len = 0
    read2_dcs_len = 0
    in_bam_file = pysam.AlignmentFile(o.prefix + '.temp.sort.bam', "rb", check_sq=False)
    first_line = in_bam_file.next()
    FinalValue = pysam.AlignedSegment()
    FinalValue.query_name = "FinalValue#ab:1"
    seq_dict[first_line.query_name.split('#')[1]].append(first_line.query_sequence)
    qual_dict[first_line.query_name.split('#')[1]].append(list(first_line.query_qualities))
    tag_count_dict = defaultdict(lambda: 0)

    print "Creating consensus reads..."
    for line in iteratorWrapper(in_bam_file.fetch(until_eof=True), FinalValue):
        tag, subtag_order = first_line.query_name.split('#')[0], \
            first_line.query_name.split('#')[1]
        if line.query_name.split('#')[0] == tag:
            seq_dict[line.query_name.split('#')[1]].append(line.query_sequence)
            qual_dict[line.query_name.split('#')[1]].append(list(line.query_qualities))
        else:
            if len(seq_dict['ab:1']) != len(seq_dict['ab:2']) or \
                    len(seq_dict['ba:1']) != len(seq_dict['ba:2']):
                raise Exception('ERROR: Read counts for Read1 and Read 2 do not match for tag %s' % tag)

            for tag_subtype in seq_dict.keys():
                if len(seq_dict[tag_subtype]) > 0:
                    tag_count_dict[len(seq_dict[tag_subtype])] += 1

                if len(seq_dict[tag_subtype]) < o.minmem:
                    seq_dict[tag_subtype] = []
                    qual_dict[tag_subtype] = []
                elif o.minmem <= len(seq_dict[tag_subtype]) <= o.maxmem:
                    # Tag types w/o reads should not be submitted as long as minmem is > 0
                    seq_dict[tag_subtype] = [
                        consensus_caller(seq_dict[tag_subtype], o.cutoff, tag, True),
                        str(len(seq_dict[tag_subtype]))]
                    qual_dict[tag_subtype] = qual_calc(qual_dict[tag_subtype])
                elif len(seq_dict[tag_subtype]) > o.maxmem:
                    seq_dict[tag_subtype] = [
                        consensus_caller(seq_dict[tag_subtype][:o.maxmem], o.cutoff, tag, True),
                        str(len(seq_dict[tag_subtype]))]
                    qual_dict[tag_subtype] = qual_calc(qual_dict[tag_subtype])

            if o.write_sscs is True:
                if len(seq_dict['ab:1']) != 0 and len(seq_dict['ab:2']) != 0:
                    corrected_qual_score = map(lambda x: x if x < 41 else 41, qual_dict['ab:1'])
                    read1_sscs_fq_file.write(
                        '@%s#ab/1\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ab:1'][0], seq_dict['ab:1'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                    corrected_qual_score = map(lambda x: x if x < 41 else 41, qual_dict['ab:2'])
                    read2_sscs_fq_file.write(
                        '@%s#ab/2\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ab:2'][0], seq_dict['ab:2'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                if len(seq_dict['ba:1']) != 0 and len(seq_dict['ba:2']) != 0:
                    corrected_qual_score = map(lambda x: x if x < 41 else 41, qual_dict['ba:1'])
                    read1_sscs_fq_file.write(
                        '@%s#ba/1\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ba:1'][0], seq_dict['ba:1'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                    corrected_qual_score = map(lambda x: x if x < 41 else 41, qual_dict['ba:1'])
                    read2_sscs_fq_file.write(
                        '@%s#ba/2\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ba:2'][0], seq_dict['ba:2'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

            if o.without_dcs is False:
                if len(seq_dict['ab:1']) != 0 and len(seq_dict['ba:2']) != 0:
                    dcs_read_1 = [
                        consensus_caller([seq_dict['ab:1'][0], seq_dict['ba:2'][0]], 1, tag, False),
                        seq_dict['ab:1'][1], seq_dict['ba:2'][1]]
                    dcs_read_1_qual = map(lambda x: x if x < 41 else 41,
                                          qual_calc([qual_dict['ab:1'], qual_dict['ba:2']]))
                    read1_dcs_len = len(dcs_read_1)
                    fam_size_x_axis.append(int(seq_dict['ab:1'][1]))
                    fam_size_y_axis.append(int(seq_dict['ba:2'][1]))

                    if dcs_read_1.count('N') / float(read1_dcs_len) > o.Ncutoff:
                        dcs_read_1 = 'N' * read1_dcs_len
                        dcs_read_1_qual = '!' * read1_dcs_len

                if len(seq_dict['ba:1']) != 0 and len(seq_dict['ab:2']) != 0:
                    dcs_read_2 = [
                        consensus_caller([seq_dict['ba:1'][0], seq_dict['ab:2'][0]], 1, tag, False),
                        seq_dict['ba:1'][1], seq_dict['ab:2'][1]]
                    dcs_read_2_qual = map(lambda x: x if x < 41 else 41,
                                          qual_calc([qual_dict['ba:1'], qual_dict['ab:2']]))
                    read2_dcs_len = len(dcs_read_2)

                    if dcs_read_2.count('N') / float(read1_dcs_len) > o.Ncutoff:
                        dcs_read_2 = 'N' * read1_dcs_len
                        dcs_read_2_qual = '!' * read2_dcs_len

                if read1_dcs_len != 0 and read2_dcs_len != 0 and tag.count('N') == 0 and \
                        'A' * o.rep_filt not in tag and 'C' * o.rep_filt not in tag and \
                        'G' * o.rep_filt not in tag and 'T' * o.rep_filt not in tag:
                    read1_dcs_fq_file.write(
                        '@%s/1\n%s\n+%s:%s\n%s\n' %
                        (tag, dcs_read_1[0], dcs_read_1[1], dcs_read_1[2],
                         "".join(chr(x + 33) for x in dcs_read_1_qual)))
                    read2_dcs_fq_file.write(
                        '@%s/2\n%s\n+%s:%s\n%s\n' %
                        (tag, dcs_read_2[0], dcs_read_2[1], dcs_read_2[2],
                         "".join(chr(x + 33) for x in dcs_read_2_qual)))

            if line != FinalValue:  # reset conditions for next tag family
                first_line = line
                seq_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
                qual_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
                read1_dcs_len = 0
                read2_dcs_len = 0
                dcs_read_1 = ''
                dcs_read_2 = ''
                # Now add initializing data for new tag
                seq_dict[line.query_name.split('#')[1]].append(line.query_sequence)
                qual_dict[first_line.query_name.split('#')[1]].append(
                    list(first_line.query_qualities))

    if o.write_sscs is True:
        read1_sscs_fq_file.close()
        read2_sscs_fq_file.close()

    if o.without_dcs is False:
        read1_dcs_fq_file.close()
        read2_dcs_fq_file.close()

    # Try to plot the tag family sizes
    if o.tagstats is True:
        tag_stats_file = open(o.prefix + ".tagstats.txt", 'w')

        x_value = []
        y_value = []
        total_reads = sum([tag_count_dict[tag_family_size] * tag_family_size
                           for tag_family_size in tag_count_dict.keys()])

        for tag_family_size in sorted(tag_count_dict.keys()):
            fraction = (tag_count_dict[tag_family_size] * tag_family_size) / float(total_reads)
            tag_stats_file.write('%d\t%d\t%f\n' %
                                 (tag_family_size, tag_count_dict[tag_family_size], fraction))
            x_value.append(tag_family_size)
            y_value.append(fraction)

        try:
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt

            plt.figure(1)
            plt.bar(x_value, y_value)
            plt.xlabel('Family Size')
            plt.ylabel('Proportion of Total Reads')
            plt.savefig(o.prefix + 'family_size.png', bbox_inches='tight')

            plt.figure(2)
            plt.scatter(fam_size_x_axis, fam_size_y_axis, alpha=.1)
            plt.xlabel('Family size for AB:1')
            plt.ylabel('Family size for BA:2')
            plt.xlim(0, max(fam_size_x_axis))
            plt.ylim(0, max(fam_size_y_axis))
            plt.savefig(o.prefix + 'fam_size_relation.png', bbox_inches='tight')
        except ImportError:
            sys.stderr.write('matplotlib not present. Only tagstats file will be generated.')

        tag_stats_file.close()
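The block comment in main() describes the ab/ba tag canonicalisation. The helper below (the name is ours, it is not part of the script) isolates that rule as implemented by the comparisons in the tag-parsing loop: the tag that compares greater is placed first, and the #ab/#ba suffix records whether Read 1 or Read 2 supplied it, so both read orders of a pair collapse to the same family name once the suffix is split off.

def canonical_duplex_tag(tag_read1, tag_read2):
    # Sketch of the ab/ba naming convention used above (helper name is ours).
    if tag_read1 > tag_read2:
        return tag_read1 + tag_read2 + '#ab'
    if tag_read1 < tag_read2:
        return tag_read2 + tag_read1 + '#ba'
    return None  # identical tags cannot be oriented and are skipped, as above

# Both read orders of the same physical pair map to one family name.
assert canonical_duplex_tag('TTTG', 'AAAC') == 'TTTGAAAC#ab'
assert canonical_duplex_tag('AAAC', 'TTTG') == 'TTTGAAAC#ba'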
def make_gnashyfile(bcfilename, outpath, genome):
    # make chromosome list
    if genome == 'hs':
        chr_list = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                    'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13',
                    'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
                    'chr20', 'chr21', 'chr22', 'chrX', 'chrY']
        chr_dict = {'chr1': 1, 'chr2': 2, 'chr3': 3, 'chr4': 4, 'chr5': 5,
                    'chr6': 6, 'chr7': 7, 'chr8': 8, 'chr9': 9, 'chr10': 10,
                    'chr11': 11, 'chr12': 12, 'chr13': 13, 'chr14': 14,
                    'chr15': 15, 'chr16': 16, 'chr17': 17, 'chr18': 18,
                    'chr19': 19, 'chr20': 20, 'chr21': 21, 'chr22': 22,
                    'chrX': 23, 'chrY': 24}
        print "making human gnashyfile"
    else:
        chr_list = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7',
                    'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13',
                    'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
                    'chrX', 'chrY']
        chr_dict = {'chr1': 1, 'chr2': 2, 'chr3': 3, 'chr4': 4, 'chr5': 5,
                    'chr6': 6, 'chr7': 7, 'chr8': 8, 'chr9': 9, 'chr10': 10,
                    'chr11': 11, 'chr12': 12, 'chr13': 13, 'chr14': 14,
                    'chr15': 15, 'chr16': 16, 'chr17': 17, 'chr18': 18,
                    'chr19': 19, 'chrX': 20, 'chrY': 21}
        print "making mouse gnashyfile"
    # read in experiments and barcodes.
    # Key = (primer barcode, transposon barcode), value = expt name
    barcode_dict = read_barcode_file(bcfilename)
    # initialize quality control dictionary
    qc_dict = {}
    # LOOP THROUGH EXPERIMENTS
    # loop through experiments and make a separate gnashy file for each
    for expt in list(set(barcode_dict.values())):
        # for each experiment, there will be multiple bam files; loop through
        # all of them and open the output gnashy file
        print "Analyzing " + expt
        output_filename = outpath + expt + ".gnashy"
        output_handle = open(output_filename, 'w')
        # LOOP THROUGH BAM FILES CORRESPONDING TO 1 experiment
        for key in barcode_dict.keys():
            # this could be made more efficient, but it is clearer this way
            if barcode_dict[key] == expt:
                primerBC = key[0]
                transposonBC = key[1]
                basename = outpath + expt + "_" + primerBC + "_" + transposonBC
                sbamFilename = basename + ".sorted"
                # sort and index bamfile
                pysam.sort(basename + ".bam", sbamFilename)
                sbamFilename = sbamFilename + ".bam"
                pysam.index(sbamFilename)
                print sbamFilename
                # initialize gnashy dictionary
                gnashy_dict = {}
                # make AlignmentFile object
                current_bamfile = pysam.AlignmentFile(sbamFilename, "rb")
                # loop through the chromosomes and pile up start sites
                for chr in chr_list:
                    aligned_reads_group = current_bamfile.fetch(chr)
                    # now loop through each read and pile up start sites
                    for aread in aligned_reads_group:
                        # is the read a reverse read?
                        if aread.is_reverse:
                            # does it align to a TTAA?
                            if (aread.query_sequence[-4:] == 'TTAA' or
                                    aread.query_sequence[-4:] == 'ttaa'):
                                # if so, get position and update dictionary
                                pos = aread.get_reference_positions()[-1]
                                if (chr, pos) in gnashy_dict:
                                    gnashy_dict[(chr, pos)] += 1
                                else:
                                    gnashy_dict[(chr, pos)] = 1
                        else:
                            # forward read: does it align to a TTAA?
                            if (aread.query_sequence[0:4] == 'TTAA' or
                                    aread.query_sequence[0:4] == 'ttaa'):
                                # if so, get position and update dictionary
                                pos = aread.get_reference_positions()[0]
                                if (chr, pos) in gnashy_dict:
                                    gnashy_dict[(chr, pos)] += 1
                                else:
                                    gnashy_dict[(chr, pos)] = 1
                # output dictionary to gnashy file
                for key in gnashy_dict:
                    output_handle.write("%s\t%s\t%s\n" %
                                        (chr_dict[key[0]], key[1], gnashy_dict[key]))
        output_handle.close()
        # OPEN GNASHY FILE AND SORT BY CHR THEN POS
        qc_dict[expt] = sort_gnashy_file(output_filename)
    # after all experiments have been analyzed, print out qc
    qc_handle = open(outpath + "gnashyQC.txt", 'w')
    for key in qc_dict:
        qc_handle.write("%s\t%s\t%s\n" % (key, qc_dict[key][0], qc_dict[key][1]))
    qc_handle.close()
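The inner loops reduce to one decision: take the last aligned reference position for reverse reads and the first for forward reads, but only when the read ends or starts with the TTAA motif. A small illustrative helper (not from the pipeline above) capturing that logic:

import pysam

def insertion_site(read):
    # Illustrative helper: return the reference coordinate of the TTAA end
    # of an aligned read, or None if the read lacks the motif or positions.
    seq = (read.query_sequence or '').upper()
    positions = read.get_reference_positions()
    if not seq or not positions:
        return None
    if read.is_reverse:
        return positions[-1] if seq.endswith('TTAA') else None
    return positions[0] if seq.startswith('TTAA') else None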
def testBlat(blc):
    if blc.count('1') > blc.count('0'):
        return 1
    return 0

#######

script_time = time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
sys.stderr.write("Script time --> START: %s\n" % (script_time))
sys.stderr.write("Analysis ID: %s\n" % (pid))

if not os.path.exists(bamfile):
    usage()
    sys.exit('BAM file %s not found.' % (bamfile))
if sortbam:
    sys.stderr.write('Sorting BAM file.\n')
    pysam.sort(bamfile, 'sorted_%s' % (pid))
    os.rename(bamfile, bamfile + '_old')
    os.rename('sorted_%s.bam' % (pid), bamfile)
    sys.stderr.write('Indexing BAM file.\n')
    pysam.index(bamfile)
if not os.path.exists(bamfile + '.bai') and not sortbam:
    sys.stderr.write('Indexing BAM file.\n')
    pysam.index(bamfile)
if not os.path.exists(fastafile):
    usage()
    sys.exit('Fasta file %s not found.' % (fastafile))
if not os.path.exists(fastafile + '.fai'):
    sys.stderr.write('Indexing Fasta file.\n')
    pysam.faidx(fastafile)
if not os.path.exists(kfile):
    sys.exit('File containing RNA editing positions not found.')
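The sort/index guard above relies on the legacy positional pysam.sort(in, out_prefix) plus two renames. A sketch of an equivalent guard using the argv-style API; the function name and the .sorting.bam temporary suffix are ours, not from the script above.

import os
import pysam

def ensure_bam_ready(bam_path, sort_in_place=False):
    # Sketch only: coordinate-sort in place (keeping the unsorted original
    # under a _old suffix) and make sure an index exists.
    if sort_in_place:
        tmp = bam_path + '.sorting.bam'
        pysam.sort('-o', tmp, bam_path)
        os.rename(bam_path, bam_path + '_old')
        os.rename(tmp, bam_path)
        pysam.index(bam_path)
    elif not os.path.exists(bam_path + '.bai'):
        pysam.index(bam_path)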
def sort_bam(self, bamfile, outprefix):
    ''' samtools sort '''
    pysam.sort(bamfile, outprefix)
    return outprefix + ".bam"
def get_consensus_report(name, sam_path, ref_path, is_circular,
                         coverage_threshold=0, report_out_dir=None,
                         tmp_files_dir=None):
    basename = os.path.basename(sam_path)
    file_name, ext = os.path.splitext(basename)

    out_dir = tmp_files_dir
    keep_tmp_files = tmp_files_dir is not None
    if not keep_tmp_files:
        out_dir = tempfile.mkdtemp()
    os.makedirs(out_dir, exist_ok=True)

    tmp_sam_path = os.path.join(out_dir, file_name + '_tmp.sam')
    tmp_bam_path = os.path.join(out_dir, file_name + '_tmp.bam')
    bam_path = os.path.join(out_dir, file_name + '.bam')
    mpileup_path = bam_path + '.bam.mpilup'

    logging.info("Split long alignments")
    split_alignments_in_sam(sam_path, tmp_sam_path)

    logging.info("Converting sam to bam")
    pysam.view('-S', tmp_sam_path, '-b', '-o', tmp_bam_path, catch_stdout=False)

    logging.info("Sorting bam file")
    pysam.sort(tmp_bam_path, '-o', bam_path, catch_stdout=False)

    logging.info("Creating bam index")
    pysam.index(bam_path, '-b')

    logging.info("Creating mpileup")
    mpileup_flags = ['-A', '-B', '-Q', '0']
    if is_circular:
        # use secondary alignments as well
        mpileup_flags.extend(['--ff', '0'])
    pysam.mpileup(*mpileup_flags, '-f', ref_path, bam_path,
                  '-o', mpileup_path, catch_stdout=False)

    logging.info("Generating consensus and report")
    report = process_mpileup(name, sam_path, ref_path, mpileup_path,
                             coverage_threshold, report_out_dir)

    if not keep_tmp_files:
        logging.info("Cleaning tmp files")
        shutil.rmtree(out_dir)

    return report
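The chain here is SAM to BAM (view), then sort, index, and mpileup. Assuming a samtools 1.x backend, the explicit view step can usually be dropped because samtools sort reads SAM input directly; a minimal sketch with placeholder file names:

import pysam

# Sketch only (our own, not from the report pipeline above); file names are
# placeholders and we assume a samtools 1.x backend behind pysam.
pysam.sort("-o", "sample.bam", "sample_tmp.sam", catch_stdout=False)
pysam.index("sample.bam")
# mpileup with BAQ and base-quality filtering disabled, as in the flags above
pysam.mpileup("-A", "-B", "-Q", "0", "-f", "ref.fa", "sample.bam",
              "-o", "sample.mpileup", catch_stdout=False)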
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option("--group-out", dest="tsv", type="string",
                     help="Outfile name for file mapping read id to read group",
                     default=None)

    group.add_option("--output-bam", dest="output_bam", action="store_true",
                     default=False,
                     help=("output a bam file with read groups tagged using the UG tag"
                           "[default=%default]"))

    group.add_option("--output-unmapped", dest="output_unmapped",
                     action="store_true", default=False,
                     help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag", dest="umi_group_tag", type="string",
                      help="tag for the outputted umi group", default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:
        # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig, metatag)
            gene_tag = metatag
        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1
            if outfile:
                outfile.write(bundle)
            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(map(str, (
                            read.query_name,
                            read.reference_name,
                            umi_methods.get_read_position(
                                read, options.soft_clip_threshold)[1],
                            gene,
                            umi.decode(),
                            counts[umi],
                            top_umi.decode(),
                            group_count,
                            unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()

        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join(
        ["%s: %s" % (x[0], x[1]) for x in
         bundle_iterator.read_events.most_common()]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))

    U.Stop()
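The script writes UG-tagged reads to a temporary file and only then coordinate-sorts them into the final output with pysam.sort -o ... -O <format>. The sketch below isolates that tag-then-sort pattern; the function, the group_ids mapping, and the file handling are illustrative and not UMI-tools code.

import os
import pysam

def tag_and_sort(in_path, out_path, group_ids):
    # Sketch only: write reads carrying a UG group tag to a temporary BAM,
    # then coordinate-sort into the final output and index it, mirroring
    # the tempfile + pysam.sort step above.
    tmp = out_path + ".unsorted.bam"
    with pysam.AlignmentFile(in_path, "rb") as bam_in, \
         pysam.AlignmentFile(tmp, "wb", template=bam_in) as bam_out:
        for read in bam_in.fetch(until_eof=True):
            # group_ids is a hypothetical {query_name: group id} mapping
            read.set_tag("UG", group_ids.get(read.query_name, -1), "i")
            bam_out.write(read)
    pysam.sort("-o", out_path, "-O", "bam", tmp)
    os.unlink(tmp)
    pysam.index(out_path)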