def makeAggregate(cells, directory, suffix, output):
    """
    Create aggregate sample.

    Make an aggregate bam file from a list of cells, sorts and indexes the
    file for easy use in IGV. Suffix is required to prevent non 0-padded
    numbers matching the wrong files. Return final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from. Duplicates are ignored.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String to match the end of the bam file, use to add file extension and
        to anchor the extension after file numbers - this will prevent cell_4
        matching cell_4*.
    output : string
        String containing output file location (without the ".bam" extension).

    Returns
    -------
    string
        ``output + ".sorted.bam"``.

    Raises
    ------
    IndexError
        If no file in ``directory`` matches ``"*" + cell + suffix`` for one of
        the cells.
    """
    from glob import glob

    fileList = []
    # Iterate in sorted order so the concatenation order is reproducible.
    for cell in sorted(set(cells)):
        pattern = os.path.join(directory, "*" + cell + suffix)
        # glob order is filesystem-dependent; sort so the pick is stable.
        matches = sorted(glob(pattern))
        if not matches:
            raise IndexError(
                "No bam file found for cell %r (pattern: %s)" % (cell, pattern))
        fileList.append(matches[0])

    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort(output + ".bam", output + ".sorted", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)
    return output + ".sorted.bam"
def main():
    """Filter a BAM file, dropping contaminant and low-identity reads.

    Command line: ``input output [-i MIN_IDENTITY] [-b BLAST_FILE ...]``.
    Reads whose name appears in any ``-b`` file (parsed with
    ``parse_blast_contaminants``) are dropped, as are reads whose identity
    ``1 - NM / alen`` is below ``--min-identity``. The surviving reads are
    written to ``output``, which is then indexed.
    """
    p = argparse.ArgumentParser()
    p.add_argument('input')
    p.add_argument('output')
    p.add_argument('-i', '--min-identity', default=0.90,
                   help="Remove reads which match reference with < [value] "
                        "[default: %(default)s]",
                   type=float)
    p.add_argument('-b', '--blast-contaminants', action='append')
    a = p.parse_args()

    logging.basicConfig(level=logging.INFO, format='[%(name)s] %(message)s')
    logger = logging.getLogger('low_identity')

    blast_contaminants = set()
    # action='append' leaves the attribute as None when -b is never given.
    for f in a.blast_contaminants or []:
        with open(f) as fp:
            blast_contaminants.update(parse_blast_contaminants(fp))

    dropped = 0
    processed = 0
    with contextlib.closing(pysam.Samfile(a.input, 'rb')) as input_bam:
        with contextlib.closing(pysam.Samfile(a.output, 'wb',
                                              template=input_bam)) as output_bam:
            for read in input_bam:
                processed += 1
                if read.qname in blast_contaminants:
                    dropped += 1
                    continue
                # float() forces true division; NM and alen are integers, so
                # on Python 2 the quotient would otherwise truncate to 0.
                pct_id = 1.0 - float(read.opt('NM')) / read.alen
                if pct_id < a.min_identity:
                    dropped += 1
                else:
                    output_bam.write(read)

    # Guard against an empty input BAM.
    pct_dropped = 100.0 * dropped / processed if processed else 0.0
    logger.info('Removed %d/%d [%0.2f%%]', dropped, processed, pct_dropped)
    pysam.index(a.output)
    logger.info("Indexed.")
def processFiles(seqfile,threshold,width):
    """Run the indel QC worker on every read overlapping NS5B.

    Indexes ``seqfile``, looks up the NS5B coordinates and sequence for the
    first reference named in the BAM header, and hands each read overlapping
    that region to ``assignWork`` in a separate process. At most
    ``multiprocessing.cpu_count()`` workers run at a time.

    seqfile   -- path of the BAM file to process
    threshold -- not used by this function
    width     -- not used by this function

    Also creates ``seqfile + ".indel_corrected.fastq"`` (this function itself
    writes nothing to it).

    Raises KeyError if the BAM's first reference is not listed in ``gene_pos``.
    """
    # Need to keep this dictionary up-to-date with references you expect to see
    gene_pos = {
        '1b_Con1_full_reference_seq':{'ns5b':{'nterm':7599,'cterm':9371,'seq':'TCGATGTCCTACACATGGACAGGCGCCCTGATCACGCCATGCGCTGCGGAGGAAACCAAGCTGCCCATCAATGCACTGAGCAACTCTTTGCTCCGTCACCACAACTTGGTCTATGCTACAACATCTCGCAGCGCAAGCCTGCGGCAGAAGAAGGTCACCTTTGACAGACTGCAGGTCCTGGACGACCACTACCGGGACGTGCTCAAGGAGATGAAGGCGAAGGCGTCCACAGTTAAGGCTAAACTTCTATCCGTGGAGGAAGCCTGTAAGCTGACGCCCCCACATTCGGCCAGATCTAAATTTGGCTATGGGGCAAAGGACGTCCGGAACCTATCCAGCAAGGCCGTTAACCACATCCGCTCCGTGTGGAAGGACTTGCTGGAAGACACTGAGACACCAATTGACACCACCATCATGGCAAAAAATGAGGTTTTCTGCGTCCAACCAGAGAAGGGGGGCCGCAAGCCAGCTCGCCTTATCGTATTCCCAGATTTGGGGGTTCGTGTGTGCGAGAAAATGGCCCTTTACGATGTGGTCTCCACCCTCCCTCAGGCCGTGATGGGCTCTTCATACGGATTCCAATACTCTCCTGGACAGCGGGTCGAGTTCCTGGTGAATGCCTGGAAAGCGAAGAAATGCCCTATGGGCTTCGCATATGACACCCGCTGTTTTGACTCAACGGTCACTGAGAATGACATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAAGCCAGACAGGCCATAAGGTCGCTCACAGAGCGGCTTTACATCGGGGGCCCCCTGACTAATTCTAAAGGGCAGAACTGCGGCTATCGCCGGTGCCGCGCGAGCGGTGTACTGACGACCAGCTGCGGTAATACCCTCACATGTTACTTGAAGGCCGCTGCGGCCTGTCGAGCTGCGAAGCTCCAGGACTGCACGATGCTCGTATGCGGAGACGACCTTGTCGTTATCTGTGAAAGCGCGGGGACCCAAGAGGACGAGGCGAGCCTACGGGCCTTCACGGAGGCTATGACTAGATACTCTGCCCCCCCTGGGGACCCGCCCAAACCAGAATACGACTTGGAGTTGATAACATCATGCTCCTCCAATGTGTCAGTCGCGCACGATGCATCTGGCAAAAGGGTGTACTATCTCACCCGTGACCCCACCACCCCCCTTGCGCGGGCTGCGTGGGAGACAGCTAGACACACTCCAGTCAATTCCTGGCTAGGCAACATCATCATGTATGCGCCCACCTTGTGGGCAAGGATGATCCTGATGACTCATTTCTTCTCCATCCTTCTAGCTCAGGAACAACTTGAAAAAGCCCTAGATTGTCAGATCTACGGGGCCTGTTACTCCATTGAGCCACTTGACCTACCTCAGATCATTCAACGACTCCATGGCCTTAGCGCATTTTCACTCCATAGTTACTCTCCAGGTGAGATCAATAGGGTGGCTTCATGCCTCAGGAAACTTGGGGTACCGCCCTTGCGAGTCTGGAGACATCGGGCCAGAAGTGTCCGCGCTAGGCTACTGTCCCAGGGGGGGAGGGCTGCCACTTGTGGCAAGTACCTCTTCAACTGGGCAGTAAGGACCAAGCTCAAACTCACTCCAATCCCGGCTGCGTCCCAGTTGGATTTATCCAGCTGGTTCGTTGCTGGTTACAGCGGGGGAGACATATATCACAGCCTGTCTCGTGCCCGACCCCGCTGGTTCATGTGGTGCCTACTCCTACTTTCTGTAGGGGTAGGCATCTATCTACTCCCCAACCGA'}},
        '1a_H77_full_reference_seq':{'ns5b':{'nterm':7602,'cterm':9374, 'seq':'TCAATGTCTTATTCCTGGACAGGCGCACTCGTCACCCCGTGCGCTGCGGAAGAACAAAAACTGCCCATCAACGCACTGAGCAACTCGTTGCTACGCCATCACAATCTGGTGTATTCCACCACTTCACGCAGTGCTTGCCAAAGGCAGAAGAAAGTCACATTTGACAGACTGCAAGTTCTGGACAGCCATTACCAGGACGTGCTCAAGGAGGTCAAAGCAGCGGCGTCAAAAGTGAAGGCTAACTTGCTATCCGTAGAGGAAGCTTGCAGCCTGACGCCCCCACATTCAGCCAAATCCAAGTTTGGCTATGGGGCAAAAGACGTCCGTTGCCATGCCAGAAAGGCCGTAGCCCACATCAACTCCGTGTGGAAAGACCTTCTGGAAGACAGTGTAACACCAATAGACACTACCATCATGGCCAAGAACGAGGTTTTCTGCGTTCAGCCTGAGAAGGGGGGTCGTAAGCCAGCTCGTCTCATCGTGTTCCCCGACCTGGGCGTGCGCGTGTGCGAGAAGATGGCCCTGTACGACGTGGTTAGCAAGCTCCCCCTGGCCGTGATGGGAAGCTCCTACGGATTCCAATACTCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAAGTCCAAGAAGACCCCGATGGGGTTCTCGTATGATACCCGCTGTTTTGACTCCACAGTCACTGAGAGCGACATCCGTACGGAGGAGGCAATTTACCAATGTTGTGACCTGGACCCCCAAGCCCGCGTGGCCATCAAGTCCCTCACTGAGAGGCTTTATGTTGGGGGCCCTCTTACCAATTCAAGGGGGGAAAACTGCGGCTACCGCAGGTGCCGCGCGAGCGGCGTACTGACAACTAGCTGTGGTAACACCCTCACTTGCTACATCAAGGCCCGGGCAGCCTGTCGAGCCGCAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGACTTAGTCGTTATCTGTGAAAGTGCGGGGGTCCAGGAGGACGCGGCGAGCCTGAGAGCCTTCACGGAGGCTATGACCAGGTACTCCGCCCCCCCCGGGGACCCCCCACAACCAGAATACGACTTGGAGCTTATAACATCATGCTCCTCCAACGTGTCAGTCGCCCACGACGGCGCTGGAAAGAGGGTCTACTACCTTACCCGTGACCCTACAACCCCCCTCGCGAGAGCCGCGTGGGAGACAGCAAGACACACTCCAGTCAATTCCTGGCTAGGCAACATAATCATGTTTGCCCCCACACTGTGGGCGAGGATGATACTGATGACCCATTTCTTTAGCGTCCTCATAGCCAGGGATCAGCTTGAACAGGCTCTTAACTGTGAGATCTACGGAGCCTGCTACTCCATAGAACCACTGGATCTACCTCCAATCATTCAAAGACTCCATGGCCTCAGCGCATTTTCACTCCACAGTTACTCTCCAGGTGAAATCAATAGGGTGGCCGCATGCCTCAGAAAACTTGGGGTCCCGCCCTTGCGAGCTTGGAGACACCGGGCCCGGAGCGTCCGCGCTAGGCTTCTGTCCAGAGGAGGCAGGGCTGCCATATGTGGCAAGTACCTCTTCAACTGGGCAGTAAGAACAAAGCTCAAACTCACTCCAATAGCGGCCGCTGGCCGGCTGGACTTGTCCGGTTGGTTCACGGCTGGCTACAGCGGGGGAGACATTTATCACAGCGTGTCTCATGCCCGGCCCCGCTGGTTCTGGTTTTGCCTACTCCTGCTCGCTGCAGGGGTAGGCATCTACCTCCTCCCCAACCGA'}},
        'H77_genome':{'ns5b':{'nterm':7602,'cterm':9374,'seq':'TCAATGTCTTATTCCTGGACAGGCGCACTCGTCACCCCGTGCGCTGCGGAAGAACAAAAACTGCCCATCAACGCACTGAGCAACTCGTTGCTACGCCATCACAATCTGGTGTATTCCACCACTTCACGCAGTGCTTGCCAAAGGCAGAAGAAAGTCACATTTGACAGACTGCAAGTTCTGGACAGCCATTACCAGGACGTGCTCAAGGAGGTCAAAGCAGCGGCGTCAAAAGTGAAGGCTAACTTGCTATCCGTAGAGGAAGCTTGCAGCCTGACGCCCCCACATTCAGCCAAATCCAAGTTTGGCTATGGGGCAAAAGACGTCCGTTGCCATGCCAGAAAGGCCGTAGCCCACATCAACTCCGTGTGGAAAGACCTTCTGGAAGACAGTGTAACACCAATAGACACTACCATCATGGCCAAGAACGAGGTTTTCTGCGTTCAGCCTGAGAAGGGGGGTCGTAAGCCAGCTCGTCTCATCGTGTTCCCCGACCTGGGCGTGCGCGTGTGCGAGAAGATGGCCCTGTACGACGTGGTTAGCAAGCTCCCCCTGGCCGTGATGGGAAGCTCCTACGGATTCCAATACTCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAAGTCCAAGAAGACCCCGATGGGGTTCTCGTATGATACCCGCTGTTTTGACTCCACAGTCACTGAGAGCGACATCCGTACGGAGGAGGCAATTTACCAATGTTGTGACCTGGACCCCCAAGCCCGCGTGGCCATCAAGTCCCTCACTGAGAGGCTTTATGTTGGGGGCCCTCTTACCAATTCAAGGGGGGAAAACTGCGGCTACCGCAGGTGCCGCGCGAGCGGCGTACTGACAACTAGCTGTGGTAACACCCTCACTTGCTACATCAAGGCCCGGGCAGCCTGTCGAGCCGCAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGACTTAGTCGTTATCTGTGAAAGTGCGGGGGTCCAGGAGGACGCGGCGAGCCTGAGAGCCTTCACGGAGGCTATGACCAGGTACTCCGCCCCCCCCGGGGACCCCCCACAACCAGAATACGACTTGGAGCTTATAACATCATGCTCCTCCAACGTGTCAGTCGCCCACGACGGCGCTGGAAAGAGGGTCTACTACCTTACCCGTGACCCTACAACCCCCCTCGCGAGAGCCGCGTGGGAGACAGCAAGACACACTCCAGTCAATTCCTGGCTAGGCAACATAATCATGTTTGCCCCCACACTGTGGGCGAGGATGATACTGATGACCCATTTCTTTAGCGTCCTCATAGCCAGGGATCAGCTTGAACAGGCTCTTAACTGTGAGATCTACGGAGCCTGCTACTCCATAGAACCACTGGATCTACCTCCAATCATTCAAAGACTCCATGGCCTCAGCGCATTTTCACTCCACAGTTACTCTCCAGGTGAAATCAATAGGGTGGCCGCATGCCTCAGAAAACTTGGGGTCCCGCCCTTGCGAGCTTGGAGACACCGGGCCCGGAGCGTCCGCGCTAGGCTTCTGTCCAGAGGAGGCAGGGCTGCCATATGTGGCAAGTACCTCTTCAACTGGGCAGTAAGAACAAAGCTCAAACTCACTCCAATAGCGGCCGCTGGCCGGCTGGACTTGTCCGGTTGGTTCACGGCTGGCTACAGCGGGGGAGACATTTATCACAGCGTGTCTCATGCCCGGCCCCGCTGGTTCTGGTTTTGCCTACTCCTGCTCGCTGCAGGGGTAGGCATCTACCTCCTCCCCAACCGA'}},
        'JFH-1_genome':{'ns5b':{'nterm':7666,'cterm':9443,'seq':'CTCCATGTCATACTCCTGGACCGGGGCTCTAATAACTCCCTGTAGCCCCGAAGAGGAAAAGTTGCCAATCAACCCTTTGAGTAACTCGCTGTTGCGATACCATAACAAGGTGTACTGTACAACATCAAAGAGCGCCTCACAGAGGGCTAAAAAGGTAACTTTTGACAGGACGCAAGTGCTCGACGCCCATTATGACTCAGTCTTAAAGGACATCAAGCTAGCGGCTTCCAAGGTCAGCGCAAGGCTCCTCACCTTGGAGGAGGCGTGCCAGTTGACTCCACCCCATTCTGCAAGATCCAAGTATGGATTCGGGGCCAAGGAGGTCCGCAGCTTGTCCGGGAGGGCCGTTAACCACATCAAGTCCGTGTGGAAGGACCTCCTGGAAGACCCACAAACACCAATTCCCACAACCATCATGGCCAAAAATGAGGTGTTCTGCGTGGACCCCGCCAAGGGGGGTAAGAAACCAGCTCGCCTCATCGTTTACCCTGACCTCGGCGTCCGGGTCTGCGAGAAAATGGCCCTCTATGACATTACACAAAAGCTTCCTCAGGCGGTAATGGGAGCTTCCTATGGCTTCCAGTACTCCCCTGCCCAACGGGTGGAGTATCTCTTGAAAGCATGGGCGGAAAAGAAGGACCCCATGGGTTTTTCGTATGATACCCGATGCTTCGACTCAACCGTCACTGAGAGAGACATCAGGACCGAGGAGTCCATATACCAGGCCTGCTCCCTGCCCGAGGAGGCCCGCACTGCCATACACTCGCTGACTGAGAGACTTTACGTAGGAGGGCCCATGTTCAACAGCAAGGGTCAAACCTGCGGTTACAGACGTTGCCGCGCCAGCGGGGTGCTAACCACTAGCATGGGTAACACCATCACATGCTATGTGAAAGCCCTAGCGGCCTGCAAGGCTGCGGGGATAGTTGCGCCCACAATGCTGGTATGCGGCGATGACCTAGTAGTCATCTCAGAAAGCCAGGGGACTGAGGAGGACGAGCGGAACCTGAGAGCCTTCACGGAGGCCATGACCAGGTACTCTGCCCCTCCTGGTGATCCCCCCAGACCGGAATATGACCTGGAGCTAATAACATCCTGTTCCTCAAATGTGTCTGTGGCGTTGGGCCCGCGGGGCCGCCGCAGATACTACCTGACCAGAGACCCAACCACTCCACTCGCCCGGGCTGCCTGGGAAACAGTTAGACACTCCCCTATCAATTCATGGCTGGGAAACATCATCCAGTATGCTCCAACCATATGGGTTCGCATGGTCCTAATGACACACTTCTTCTCCATTCTCATGGTCCAAGACACCCTGGACCAGAACCTCAACTTTGAGATGTATGGATCAGTATACTCCGTGAATCCTTTGGACCTTCCAGCCATAATTGAGAGGTTACACGGGCTTGACGCCTTTTCTATGCACACATACTCTCACCACGAACTGACGCGGGTGGCTTCAGCCCTCAGAAAACTTGGGGCGCCACCCCTCAGGGTGTGGAAGAGTCGGGCTCGCGCAGTCAGGGCGTCCCTCATCTCCCGTGGAGGGAAAGCGGCCGTTTGCGGCCGATATCTCTTCAATTGGGCGGTGAAGACCAAGCTCAAACTCACTCCATTGCCGGAGGCGCGCCTACTGGACTTATCCAGTTGGTTCACCGTCGGCGCCGGCGGGGGCGACATTTTTCACAGCGTGTCGCGCGCCCGACCCCGCTCATTACTCTTCGGCCTACTCCTACTTTTCGTAGGGGTAGGCCTCTTCCTACTCCCCGCTCGGTAGA'}}}

    cpus = multiprocessing.cpu_count()
    local_path = os.getcwd()
    print("Beginning multiprocess indel QC with  %s  cpu's ...." % cpus)

    ps.index(seqfile)
    bam = ps.Samfile(seqfile,'rb')
    # Name the output after the input BAM (the previous code referenced an
    # undefined name `bamfile` here).
    outFASTQfile = open(seqfile+".indel_corrected.fastq",'w')
    try:
        ref = bam.references[0]
        if ref not in gene_pos:
            raise KeyError("reference %r has no entry in gene_pos" % (ref,))
        ns5b = gene_pos[ref]['ns5b']
        read_pool = bam.fetch(ref, ns5b['nterm'], ns5b['cterm'])

        jobs = []
        for read in read_pool:
            # The reference sequence lives under the 'ns5b' entry.
            p = multiprocessing.Process(target=assignWork,
                                        args=(read, ns5b['seq'], local_path))
            jobs.append(p)
            p.start()
            # Never keep more than `cpus` workers alive at once; spawning one
            # process per read without a bound can exhaust the machine.
            if len(jobs) >= cpus:
                for j in jobs:
                    j.join()
                jobs = []
        for j in jobs:
            j.join()
    finally:
        outFASTQfile.close()
        bam.close()
    print("QC complete\n\n")
def run(self):
    """Build a consensus FASTQ for the mapped reads.

    Converts ``self.samFile`` to a sorted, indexed BAM in the local temp
    directory, calls variants with ``samtools mpileup | bcftools view`` into
    ``<reads>_<reference>_Consensus.vcf`` in ``self.outputDir``, converts that
    to FASTQ with ``vcfutils.pl vcf2fq``, passes the result through
    ``formatConsensusFastq`` and finally calls ``self.finish()``.
    """
    AbstractAnalysis.run(self)  # Call base method to do some logging

    unsortedBam = os.path.join(self.getLocalTempDir(), "mapping.bam")
    sortedPrefix = os.path.join(self.getLocalTempDir(), "mapping.sorted")
    sortedBam = sortedPrefix + ".bam"

    samToBamFile(self.samFile, unsortedBam)
    pysam.sort(unsortedBam, sortedPrefix)
    pysam.index(sortedBam)
    pysam.faidx(self.referenceFastaFile)

    # Output names combine the read-file and reference-file base names.
    readsName = self.readFastqFile.split(".fastq")[0].split("/")[-1]
    referenceName = self.referenceFastaFile.split(".fa")[0].split("/")[-1]
    file_header = readsName + "_" + referenceName
    consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
    consensus_fastq = os.path.join(self.outputDir,
                                   file_header + "_Consensus.fastq")

    pileupCommand = "samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s" \
        % (self.referenceFastaFile, sortedBam, consensus_vcf)
    system(pileupCommand)
    system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
    # Remove the .fai created by pysam.faidx above.
    system("rm -rf %s" % (self.referenceFastaFile + ".fai"))

    # Reformat into a temp file, then move it over the raw consensus.
    formatted_consensus_fastq = os.path.join(self.getLocalTempDir(),
                                             "Consensus.fastq")
    formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
    system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))

    self.finish()
def align_to_bam_file(self, reference_fasta_path, query_fasta_path,
                      output_bam_path, multiple=False, assert_record=None):
    """Align query to reference and write a sorted, indexed BAM file.

    The alignment lines produced by ``self._align`` are written to a SAM file
    next to the output (same name with a ``.sam`` extension), converted to
    ``output_bam_path + '.unsorted'``, then sorted and indexed.

    :param reference_fasta_path: reference FASTA passed to ``self._align``
    :param query_fasta_path: query FASTA passed to ``self._align``
    :param output_bam_path: destination BAM path; expected to end in ``.bam``
    :param multiple: forwarded unchanged to ``self._align``
    :param assert_record: accepted for interface compatibility; not used here
    """
    logging.debug('LastzRunner: running on reference %s and query %s' %
                  (reference_fasta_path, query_fasta_path))

    # Strip only a trailing ".bam". str.replace would rewrite every ".bam" in
    # the path, including one that appears in a directory name.
    if output_bam_path.endswith('.bam'):
        output_prefix = output_bam_path[:-len('.bam')]
    else:
        output_prefix = output_bam_path

    output_sam_path = os.path.abspath(
        os.path.expandvars(output_prefix + '.sam'))
    output_bam_unsorted_path = os.path.abspath(
        os.path.expandvars(output_bam_path + '.unsorted'))

    logging.debug(
        'LastzRunner: aligning with output in temporary sam file %s' %
        output_sam_path)
    with open(output_sam_path, 'w') as output_sam_handler:
        for line in self._align(reference_fasta_path, query_fasta_path,
                                multiple):
            output_sam_handler.write(line)

    logging.debug(
        'LastzRunner: transforming sam into unsorted bam file %s' %
        output_bam_unsorted_path)
    input_sam_handler = pysam.Samfile(output_sam_path, "r")
    try:
        output_bam_file = pysam.Samfile(
            output_bam_unsorted_path, "wb", template=input_sam_handler)
        try:
            logging.debug('LastzRunner: copying from sam file to bam file')
            for s in input_sam_handler:
                output_bam_file.write(s)
        finally:
            output_bam_file.close()
    finally:
        # Previously the SAM reader was never closed.
        input_sam_handler.close()

    logging.debug('LastzRunner: sorting and indexing bam file %s' %
                  output_bam_path)
    # pysam.sort appends ".bam" to the prefix it is given.
    pysam.sort(output_bam_unsorted_path, output_prefix)
    pysam.index(output_prefix + '.bam')
def generate_bam_index(auxiliary_file_store_item_uuid, datafile_path):
    """Create a BAM index next to ``datafile_path`` and register it.

    The index is built with pysam and recorded as the source of the auxiliary
    FileStoreItem, which is then saved and symlinked.

    :param auxiliary_file_store_item_uuid: uuid of the FileStoreItem that will
        represent the generated index
    :type auxiliary_file_store_item_uuid: string
    :param datafile_path: full path on disk to the BAM file to index
    :type datafile_path: string
    """
    # NOTE: the ORM .get() calls are deliberately left unguarded so that the
    # calling task fails if either lookup fails. http://bit.ly/1KSbazM
    index_extension = FileExtension.objects.get(name="bai").name
    index_item = FileStoreItem.objects.get(uuid=auxiliary_file_store_item_uuid)

    # FIXME: This should be refactored once we don't have a need for
    # Standalone IGV, because it writes the index into the same directory as
    # its bam file.
    pysam.index(bytes(datafile_path))

    # Point the FileStoreItem at the newly created index file.
    index_path = "{}.{}".format(datafile_path, index_extension)
    index_item.source = index_path
    index_item.set_filetype(index_extension)
    index_item.save()

    # Symlink the newly created bam index datafile.
    index_item.symlink_datafile()
def saveReads(dataHub, nameExtra=None):
    """Write each sample's relevant reads to a sorted, indexed BAM file.

    Does nothing unless ``dataHub.args.save_reads`` is set. The output name is
    ``save_reads`` (with ``.bam`` appended if missing); when there is more
    than one sample the sample index is inserted before ``.bam``, followed by
    ``nameExtra`` if given. For every sample the reads in ``sample.reads`` and
    ``sample.readStatistics.reads`` are written, then the file is sorted to
    ``<name>.sorted.bam`` and indexed.
    """
    if not dataHub.args.save_reads:
        return

    logging.info("* Saving relevant reads *")
    for i, sample in enumerate(dataHub):
        outbam_path = dataHub.args.save_reads
        if not outbam_path.endswith(".bam"):
            outbam_path += ".bam"

        # Work on the stem so only the trailing ".bam" is touched;
        # str.replace would rewrite every ".bam" in the path.
        stem = outbam_path[:-len(".bam")]
        if len(dataHub.samples) > 1:
            logging.debug("Using i = {}".format(i))
            stem += ".{}".format(i)
        if nameExtra is not None:
            stem += ".{}".format(nameExtra)
        outbam_path = stem + ".bam"

        logging.info(" Outpath: {}".format(outbam_path))

        # print out just the reads we're interested for use later
        bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
        try:
            for read in sample.reads:
                bam_small.write(read)
            for read in sample.readStatistics.reads:
                bam_small.write(read)
        finally:
            bam_small.close()

        sorted_path = stem + ".sorted"
        pysam.sort(outbam_path, sorted_path)
        pysam.index(sorted_path + ".bam")
def populate(self, sam_file_name, minimum_alignment_score):
    """Collect the bases each read shows around ``self.position``.

    Opens the BAM file (building its index first if it has none) and piles up
    ``self.contig`` over positions ``self.position - 3`` to
    ``self.position + 1``. For every read covering one of those columns an
    entry is created in ``self.reads`` keyed by read name; the base is stored
    under the 1-based position when its Phred+33 quality is strictly greater
    than ``minimum_alignment_score``.

    Raises RuntimeError if ``self.contig`` is empty.
    """
    if self.contig == "":
        # This error used to be constructed but never raised.
        raise RuntimeError("contig must be set before reading a bam file")

    # Necessary because there is no ">" in the bam file...
    current_contig_to_analyse = self.contig.lstrip('>')

    sys.stderr.write("Loading file %s\n" % sam_file_name)
    samfile = pysam.Samfile(sam_file_name, 'rb')
    if not samfile._hasIndex():  # if no index, we must build it
        samfile.close()
        sys.stderr.write("Building index for %s\n" % sam_file_name)
        pysam.index(sam_file_name)
        samfile = pysam.Samfile(sam_file_name, 'rb')

    try:
        window_start = self.position - 3
        window_end = self.position + 1
        if window_start < 0:
            sys.stderr.write(
                "%s position %s. I have problem computing this position\n"
                % (self.contig, self.position))

        columns = samfile.pileup(current_contig_to_analyse,
                                 max([0, window_start]), window_end)
        for pileup_data in columns:
            # pileup may yield columns outside the requested window.
            if not (window_start <= pileup_data.pos <= window_end):
                continue
            for pileup_read in pileup_data.pileups:
                alignment = pileup_read.alignment
                if alignment.qname not in self.reads:
                    self.reads[alignment.qname] = {}
                quality = ord(alignment.qual[pileup_read.qpos]) - 33
                if quality > minimum_alignment_score:
                    # using biological position, not python.
                    self.reads[alignment.qname][int(pileup_data.pos + 1)] = \
                        alignment.seq[pileup_read.qpos]
    finally:
        samfile.close()
def tophat_map(gtf, out_dir, prefix, fastq, thread, bw=False, scale=False,
               gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed

    gtf      -- annotation file given to TopHat2 with -G when gtf_flag is set
    out_dir  -- working directory; results go to <out_dir>/tophat
    prefix   -- bowtie2 index prefix inside <out_dir>/bowtie2_index
    fastq    -- list of FASTQ paths (joined with commas)
    thread   -- value passed to TopHat2 -p
    bw       -- create a BigWig track from accepted_hits.bam
    scale    -- scale coverage by 1e9 / mapped reads / read length
    gtf_flag -- pass -G <gtf> to TopHat2 when true

    Exits the interpreter with an error message if TopHat2 or
    bedGraphToBigWig fails.
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + ','.join(fastq)
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    # Test the whole wait status: shifting by 8 discards the low byte, which
    # is where death-by-signal is reported.
    if os.system(tophat_cmd) != 0:
        sys.exit('Error: cannot map reads with TopHat2!')

    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)

    # create Bigwig file if needed
    if not bw:
        return
    if which('bedGraphToBigWig') is None:
        # Only report the missing tool when a BigWig was actually requested.
        print('Could not find bedGraphToBigWig, so skip this step!')
        return

    print('Create BigWig file...')
    map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
    # index bam if not exist
    if not os.path.isfile(map_bam_fname + '.bai'):
        pysam.index(map_bam_fname)

    chrom_size_fname = '%s/tophat/chrom.size' % out_dir
    s = 1
    map_bam = pysam.AlignmentFile(map_bam_fname, 'rb')
    try:
        # extract chrom size file
        with open(chrom_size_fname, 'w') as chrom_size_f:
            for seq in map_bam.header['SQ']:
                chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
        if scale:  # scale to HPB
            mapped_reads = map_bam.mapped
            read_length = None
            for read in map_bam:
                read_length = read.query_length
                break
            if mapped_reads and read_length:
                s = 1000000000.0 / mapped_reads / read_length
            else:
                # Empty BAM: nothing to scale, and dividing would fail.
                print('No mapped reads found, so skip scaling!')
    finally:
        map_bam.close()

    map_bam = pybedtools.BedTool(map_bam_fname)
    bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
    with open(bedgraph_fname, 'w') as bedgraph_f:
        for line in map_bam.genome_coverage(bg=True, g=chrom_size_fname,
                                            scale=s, split=True):
            # round the coverage to the nearest integer
            value = str(int(float(line[3]) + 0.5))
            bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)

    bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
    bigwig_cmd = 'bedGraphToBigWig %s %s %s' % (bedgraph_fname,
                                                chrom_size_fname,
                                                bigwig_fname)
    if os.system(bigwig_cmd) != 0:
        sys.exit('Error: cannot convert bedGraph to BigWig!')
def splitByStrand(bamfile, pe):
    """Split a BAM file into ``<prefix>_plus.bam`` and ``<prefix>_minus.bam``.

    Each output is produced with ``samtools view -b`` and then indexed. An
    output that already exists is left untouched.

    bamfile -- path of the input BAM; the prefix is the text before ".bam"
    pe      -- true for paired-end data (selects on first-in-pair reads)

    Returns the list ``[plus_bam, minus_bam]``.
    Raises RuntimeError if samtools exits with a non-zero status.
    """
    bam_prefix = bamfile.split(".bam")[0]

    # Every flag is its own argv element. Passed as a single string such as
    # '-f 0x40 -F 0x10', samtools parses only the first number and silently
    # ignores the second option.
    if pe:
        flags = [(['-f', '0x40', '-F', '0x10'], 'plus'),
                 (['-f', '0x40', '-F', '0x20'], 'minus')]
    else:
        flags = [(['-F', '0x10'], 'plus'),
                 (['-f', '0x10'], 'minus')]

    split_names = []
    for flag_args, strand in flags:
        out_name = bam_prefix + "_" + strand + ".bam"
        split_names.append(out_name)
        cmd = ['samtools', 'view', '-b'] + flag_args + [bamfile]
        print(cmd + [out_name])
        if os.path.exists(out_name):
            continue
        # BAM is binary; the handle is closed before indexing.
        with open(out_name, 'wb') as outfile:
            p = Popen(cmd, stdout=outfile)
            returncode = p.wait()
        if returncode != 0:
            # Do not leave a partial file that a later run would skip.
            os.remove(out_name)
            raise RuntimeError("samtools view failed (exit status %s) for %s"
                               % (returncode, out_name))
        pysam.index(out_name)

    # Return split BAM names
    return split_names
def indexed_bam(bam_filename):
    """Yield an open pysam reader for ``bam_filename``.

    A ``.bai`` index is built first if none exists next to the BAM file. The
    reader is closed when the generator is resumed or closed, including when
    the consumer raised.
    """
    import pysam
    if not os.path.exists(bam_filename + ".bai"):
        pysam.index(bam_filename)
    sam_reader = pysam.Samfile(bam_filename, "rb")
    try:
        yield sam_reader
    finally:
        sam_reader.close()
def convert_sam_to_bam():
    """
    This method should take a newly create .sam file from alignment and
    - convert it to .bam
    - sort .bam
    - index .bam

    It runs for every id returned by generate_ids(), reading
    <SAMPLE_DIR>/<id>/<id>-bwape.sam and leaving the sorted BAM at
    <SAMPLE_DIR>/<id>/<id>-bwape.bam.
    """
    ids = generate_ids()
    for sample_id in ids:
        start_time = time()
        print('converting: %s' % sample_id)
        base_path = os.path.join(SAMPLE_DIR, sample_id)
        sam_path = os.path.join(base_path, sample_id + '-bwape.sam')
        bam_path = os.path.join(base_path, sample_id + '-bwape.bam')

        bam_content = pysam.view('-bS', sam_path)
        # BAM is binary data; the handle is closed even if the write fails.
        with open(bam_path, 'wb') as bam_file:
            bam_file.writelines(bam_content)

        pysam.sort(bam_path, bam_path + '_sorted')
        sorted_bam = bam_path + '_sorted.bam'
        pysam.index(sorted_bam)

        # Sorting creates file.bam_sorted.bam. Move it to file.bam.
        # os.rename replaces the shell `mv`, which broke on paths containing
        # spaces or shell metacharacters.
        os.rename(sorted_bam, bam_path)
        # NOTE(review): the index lands at "<id>-bwape.bam.bam.bai", not
        # "<id>-bwape.bam.bai". Kept as-is in case other code looks for that
        # name; confirm which one is intended.
        os.rename(sorted_bam + '.bai', bam_path + '.bam.bai')

        end_time = time()
        print('completed: %.3fs' % (end_time - start_time))
def sort_bam(in_bam, sort_fn, to_include=None):
    """Rewrite a BAM file with its reference sequences in a custom order.

    in_bam     -- input BAM path; an index is built if "<in_bam>.bai" is missing
    sort_fn    -- comparison function passed to list.sort() to order the
                  (name, header_entry) pairs
    to_include -- optional container of reference names to keep

    Reads are written chromosome by chromosome in the new order, with their
    reference ids remapped through _id_remapper. A read whose mate maps to a
    reference that is not kept is dropped.

    Returns the output path, "<in_bam without extension>-ksort<extension>".
    """
    out_file = "%s-ksort%s" % os.path.splitext(in_bam)
    index_file = "%s.bai" % in_bam
    if not os.path.exists(index_file):
        pysam.index(in_bam)

    orig = pysam.Samfile(in_bam, "rb")
    try:
        chroms = [(c["SN"], c) for c in orig.header["SQ"]]
        new_chroms = chroms[:]
        if to_include:
            new_chroms = [(c, x) for (c, x) in new_chroms if c in to_include]
        new_chroms.sort(sort_fn)
        remapper = _id_remapper(chroms, new_chroms)

        new_header = orig.header
        new_header["SQ"] = [h for (_, h) in new_chroms]
        new = pysam.Samfile(out_file, "wb", header=new_header)
        try:
            for (chrom, _) in new_chroms:
                for read in orig.fetch(chrom):
                    write = True
                    read.rname = remapper[read.rname]
                    try:
                        read.mrnm = remapper[read.mrnm]
                    # read pair is on a chromosome we are not using
                    except KeyError:
                        assert to_include is not None
                        write = False
                    if write:
                        new.write(read)
        finally:
            # Closing flushes the output; previously neither file was closed.
            new.close()
    finally:
        orig.close()
    return out_file
def extractRegion(bamfile,start,stop,output,exact):
    """Write the reads overlapping a region of the first reference to a file.

    bamfile -- BAM path; it is (re)indexed before being opened
    start   -- region start passed to fetch()
    stop    -- region end passed to fetch()
    output  -- 'fastq' or 'fasta'; also used as the output file extension
    exact   -- any value other than '' keeps only reads that span the whole
               region (read.pos <= start and read.aend >= stop)

    The result is written to bamfile + ".extracted." + output.
    """
    pysam.index(bamfile)          # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile,'rb')   # and must be done before bamfile is opened
    try:
        ref = bam.references[0]   # Get name of reference reads aligned to in bam
        if output == 'fastq':
            formatter = writeFastQ
        elif output == 'fasta':
            formatter = writeFastA
        else:
            formatter = None      # unknown format: the file is created but left empty

        with open(bamfile+".extracted."+output,'w') as outfile:
            # Get the reads in region of interest and process them
            for read in bam.fetch(ref, start, stop):
                if exact != '' and not (read.pos <= start and read.aend >= stop):
                    continue
                if formatter is not None:
                    # The exact/fasta branch used to call .write() on the
                    # `output` string instead of the file handle.
                    outfile.write(formatter(read))
    finally:
        bam.close()
    return
def extractRegion(bamfile,start,stop,output):
    """Write the reads overlapping a region of the first reference to a file.

    bamfile -- BAM path; it is (re)indexed before being opened
    start   -- region start passed to fetch()
    stop    -- region end passed to fetch()
    output  -- 'fastq' or 'fasta'; also used as the output file extension

    Reverse-strand reads are reverse complemented (and their qualities
    reversed) before being written. The result is written to
    bamfile + ".extracted." + output.
    """
    pysam.index(bamfile)          # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile,'rb')   # and must be done before bamfile is opened
    try:
        ref = bam.references[0]   # Get name of reference reads aligned to in bam
        with open(bamfile+".extracted."+output,'w') as outfile:
            # Get the reads in region of interest and process them
            for read in bam.fetch(ref, start, stop):
                reverse = read.is_reverse
                if reverse:
                    # all reverse reads in a bam file have been reverse
                    # complemented already so they need to be reverse
                    # complemented again. str() works on every Biopython
                    # release; Seq.tostring() has been removed.
                    sequence = str(Seq(read.query).reverse_complement())
                else:
                    sequence = read.query

                if output == 'fastq':
                    # qualities are only needed, and only reversed, for fastq
                    if reverse:
                        quality = reverseString(read.qqual)
                    else:
                        quality = read.qqual
                    outfile.write("@"+read.qname+"\n"+sequence+"\n+\n"+quality+"\n")
                elif output == 'fasta':
                    outfile.write('>'+read.qname+'\n'+sequence+'\n')
    finally:
        bam.close()
    return
def read_directions_count(bam_file):
    """
    count first-in-pair, properly paired reads by strand in a bam file

    @args bam_file: binary file format for storing sequencing reads information
    @type bam_file: str

    Returns a dict with the keys 'forward_reads_count' and
    'reverse_reads_count'.
    """
    ## index the bam file when no index sits next to it
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file)

    counts = {'forward_reads_count': 0,
              'reverse_reads_count': 0}

    bam_fh = pysam.Samfile(bam_file, "rb")
    for read in bam_fh.fetch():
        # only the first mate of a proper pair is counted
        if not (read.is_proper_pair and read.is_read1):
            continue
        if read.is_reverse:
            counts['reverse_reads_count'] += 1
        else:
            counts['forward_reads_count'] += 1
    bam_fh.close()

    return counts
def check_bam(bam, p, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths

    bam            -- path of the BAM file; it is replaced in place when it
                      has to be sorted
    p              -- value passed to the sort command's -@ option
    make_new_index -- rebuild the index even if "<bam>.bai" already exists

    The file is sorted unless its header declares SO:coordinate. The index is
    (re)built when it is missing, when make_new_index is true, or when the
    file has just been re-sorted.
    """
    p = str(p)

    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    try:
        chrom_sizes = {}
        for i in test_head.header['SQ']:
            chrom_sizes[i['SN']] = int(i['LN'])
        try:
            sort_order = test_head.header['HD']['SO']
        except KeyError:
            sort_order = None
    finally:
        # Close before the file is replaced below.
        test_head.close()

    sorted_now = False
    if sort_order != 'coordinate':
        print(' sorting bam file')
        # Keep the temporary file next to the BAM so the rename below never
        # crosses filesystems and concurrent runs in one cwd do not collide.
        temp_prefix = bam + '.sorted.temp'
        pysam.sort('-@', p, bam, temp_prefix)
        os.remove(bam)
        os.rename(temp_prefix + '.bam', bam)
        sorted_now = True

    # check if indexed; an index that predates a re-sort is stale
    index_exists = os.path.exists('{}.bai'.format(bam))
    if sorted_now or make_new_index or not index_exists:
        print(' indexing bam file')
        pysam.index(bam)
    return chrom_sizes
def main(**args):
    """Build the track hub for an experiment.

    Reads the keyword arguments "genomedatabase", "samplefile" and "expname".
    Every sample's BAM is converted with convertbam, the tRNA genome FASTA is
    indexed with ``samtools faidx``, and for every replicate group the
    per-sample genome BAMs are merged, indexed and turned into bigwigs under
    <expname>/trackhub. Finally the track database is created.
    """
    dbname = args["genomedatabase"]
    sampledata = samplefile(args["samplefile"])
    expname = args["expname"]

    trackdir = expname + "/trackhub"
    scriptdir = os.path.dirname(os.path.realpath(sys.argv[0])) + "/"
    if not os.path.exists(trackdir):
        os.makedirs(trackdir)

    # Convert each sample's alignments to genome coordinates.
    for currsample in sampledata.getsamples():
        convertbam(dbname, sampledata.getbam(currsample),
                   currsample + "-genome.bam", scriptdir, force=True)

    genomefasta = dbname + "-tRNAgenome.fa"
    faidxjob = subprocess.Popen("samtools faidx " + genomefasta, shell=True)
    faidxjob.wait()

    # One merged, indexed BAM and one set of bigwigs per replicate group.
    for currrep in sampledata.allreplicates():
        mergedbam = currrep + "-mergegenome.bam"
        samplebams = [curr + "-genome.bam"
                      for curr in sampledata.getrepsamples(currrep)]
        samtoolsmerge(samplebams, mergedbam, True)
        pysam.index(mergedbam)
        makebigwigs(mergedbam, currrep, genomefasta + ".fai", trackdir)

    createtrackdb(sampledata.allreplicates(), expname)
def sort_by_position(bam_file, dir):
    """Coordinate-sort and index a BAM file.

    bam_file -- file name of the BAM; must contain ".bam"
    dir      -- prefix concatenated in front of the file name (include the
                trailing path separator)

    Returns the path of the sorted BAM, dir + <prefix> + "_sorted.bam".
    Prints a message and exits with status 2 when bam_file is not a BAM name.
    """
    ## get the file prefix
    try:
        # The dot is escaped: the old pattern "(.*).bam" also accepted names
        # such as "rebam".
        prefix_match = re.match(r"(.*)\.bam", bam_file)
    except TypeError:
        prefix_match = None    # bam_file is not a string
    if prefix_match is None:
        print("Existing: Invalid bam file -i %s" % (bam_file))
        sys.exit(2)
    prefix = prefix_match.group(1)

    # sort the bam file
    bam_input = dir + bam_file
    sort_bam = dir + prefix + "_sorted"
    pysam.sort(bam_input, sort_bam)
    sort_bam = sort_bam + ".bam"

    # index the sort bam file
    pysam.index(sort_bam)

    print("")
    print("Writing Sorted Bam File : %s" % (sort_bam))
    print("Writing Index Sorted Bam File : %s.bai" % (sort_bam))
    return sort_bam
def sort_output(outPrefix):
    '''Coordinate-sort <outPrefix>.originalSort.bam and index the result.

    The sorted file is <outPrefix>.coordSort.bam; the unsorted input is left
    in place.
    '''
    unsorted_bam = outPrefix + '.originalSort.bam'
    sorted_prefix = outPrefix + '.coordSort'
    pysam.sort(unsorted_bam, sorted_prefix)

    ## Build the bam index for output
    sorted_bam = sorted_prefix + '.bam'
    pysam.index(sorted_bam)
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path):
    """Map a read pair with ``bwa aln``/``bwa sampe``; sort and index the BAM.

    pe1_path, pe2_path -- FASTQ files of the two mates
    genome_path        -- reference FASTA; a bwa index is built for it in a
                          temporary directory
    output_path        -- output prefix; the result is output_path + ".bam"
                          together with its index

    Raises subprocess.CalledProcessError if a bwa step fails.
    """
    import shutil

    work_dir = tempfile.mkdtemp()
    try:
        genome_db = os.path.join(work_dir, "genome")
        pe1_output = os.path.join(work_dir, "pe1.sai")
        pe2_output = os.path.join(work_dir, "pe2.sai")
        bwa_output = os.path.join(work_dir, "output.sam")

        # The null device must be opened for writing to serve as stderr.
        with open(os.devnull, "w") as null:
            subprocess.check_call(
                ["bwa", "index", "-p", genome_db, genome_path], stderr=null)
            with open(pe1_output, "w") as pe1_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe1_path],
                    stdout=pe1_file, stderr=null)
            with open(pe2_output, "w") as pe2_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe2_path],
                    stdout=pe2_file, stderr=null)
            with open(bwa_output, "w") as bwa_file:
                subprocess.check_call(
                    ["bwa", "sampe",
                     "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                     genome_db, pe1_output, pe2_output, pe1_path, pe2_path],
                    stdout=bwa_file, stderr=null)

        sam_to_bam(bwa_output, bwa_output + ".bam")
        pysam.sort(bwa_output + ".bam", output_path)
        pysam.index(output_path + '.bam')
    finally:
        # The index and intermediate files are no longer needed.
        shutil.rmtree(work_dir, ignore_errors=True)
def bwa_mem(pe1_path, pe2_path, genome_path, threads, output_path):
    """Map a read pair with ``bwa mem``; sort and index the resulting BAM.

    pe1_path, pe2_path -- FASTQ files of the two mates
    genome_path        -- reference FASTA; a bwa index is built for it in a
                          temporary directory
    threads            -- thread count for ``bwa mem -t`` (int or str)
    output_path        -- output prefix; the result is output_path + ".bam"
                          plus its index, and bwa's stderr is written to
                          output_path + ".bwa.1"

    Raises subprocess.CalledProcessError if a bwa step fails.
    """
    print('Aligning with bwa mem')
    start = time()
    work_dir = tempfile.mkdtemp()
    try:
        genome_db = os.path.join(work_dir, "genome")
        bwa_output = os.path.join(work_dir, "output.sam")

        with open(output_path + '.bwa.1', 'w') as stderr_file:
            subprocess.check_call(
                ["bwa", "index", "-p", genome_db, genome_path],
                stderr=stderr_file)
            with open(bwa_output, "w") as bwa_file:
                # subprocess arguments must be strings.
                subprocess.check_call(
                    ["bwa", "mem", "-t", str(threads), genome_db,
                     pe1_path, pe2_path],
                    stdout=bwa_file, stderr=stderr_file)

        elapsed = time() - start
        print('Time elapsed for bwa mem:  %s' % elapsed)

        sam_to_bam(bwa_output, bwa_output + ".bam")
        pysam.sort(bwa_output + ".bam", output_path)
        pysam.index(output_path + '.bam')
    finally:
        # Clean up on failure as well as on success.
        shutil.rmtree(work_dir, ignore_errors=True)
def bamFile(tmpdir_factory):
    """Create a small indexed BAM file for tests and return its path.

    The file has one reference ("ref", length 1000) and three records written
    from a single, successively modified AlignedSegment: "read3", "read2" and
    "read1". The return value is the path object produced by
    ``tmpdir_factory``.
    """
    bam_path = tmpdir_factory.mktemp('test').join('test.bam')
    sam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': 1000, 'SN': 'ref'}],
    }
    writer = pysam.AlignmentFile(str(bam_path), "wb", header=sam_header)

    # The same segment object is reused: attributes set for one record carry
    # over to the next unless they are overwritten.
    segment = pysam.AlignedSegment()

    # read3: 10 aligned bases, flag 16
    segment.query_name = "read3"
    segment.query_sequence = "GGGGAAAAAT"
    segment.reference_start = 28
    segment.reference_id = 0
    segment.mapping_quality = 20
    segment.cigar = ((0, 10), )
    segment.flag = 16
    writer.write(segment)

    # read2: flag 0, starts at 32
    segment.query_name = "read2"
    segment.reference_start = 32
    segment.query_sequence = "AAAAATTTTT"
    segment.flag = 0
    writer.write(segment)

    # read1: multi-operation CIGAR
    segment.query_name = "read1"
    segment.query_sequence = "TTAAAAACCCCCGGC"
    segment.cigar = ((5, 5), (4, 2), (0, 10), (2, 2), (0, 1), (1, 1), (0, 1))
    writer.write(segment)

    writer.close()
    pysam.index(str(bam_path))
    return bam_path
def run_cufflinks(org_db, num_cpus=4):
    """
    run cufflinks program on mapped reads

    org_db   -- dict with the keys 'short_name', 'max_intron_len',
                'read_assembly_dir' and 'read_map_dir'
    num_cpus -- value passed to cufflinks -p

    Uses <read_map_dir>/<short_name>_Aligned_mmr_sortbyCoord.bam when it
    exists; otherwise sorts <short_name>_Aligned_mmr.bam into that name first.
    The BAM is indexed if needed. The process working directory is changed to
    'read_assembly_dir' before cufflinks is started.

    Exits the interpreter when cufflinks is not on $PATH or no BAM file can be
    found. A failing cufflinks run is reported on stdout, not raised.
    """
    try:
        # Send the probe's output to the null device: subprocess.call with
        # PIPE can block if the child fills the pipe.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(["cufflinks"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `Cufflinks` binary is in your $PATH")

    org_name = org_db['short_name']
    print("preparing for cufflinks run for organism %s" % org_name)

    min_intron_length = 20
    min_isoform_frac = 0.25
    max_intron_length = org_db['max_intron_len']
    result_dir = org_db['read_assembly_dir']

    bam_file = "%s/%s_Aligned_mmr_sortbyCoord.bam" % (org_db['read_map_dir'], org_name)
    if not os.path.isfile(bam_file):
        sys.stdout.write("failed to fetch sorted mmr BAM file for organism: %s, trying to get the mmr file...\n" % org_name)
        unsorted_bam = "%s/%s_Aligned_mmr.bam" % (org_db['read_map_dir'], org_name)
        if not os.path.isfile(unsorted_bam):
            sys.exit("error: failed to fetch mmr BAM file for organism %s" % org_name)

        ## sort only when the coordinate-sorted file is missing; previously an
        ## existing sorted file was sorted again into
        ## *_sortbyCoord_sortbyCoord.bam
        file_prefix, ext = os.path.splitext(unsorted_bam)
        sorted_bam = "%s_sortbyCoord" % file_prefix
        sys.stdout.write("trying to sort based by the coordinates with output prefix as: %s\n" % sorted_bam)
        pysam.sort(unsorted_bam, sorted_bam)
        bam_file = "%s.bam" % sorted_bam

    print('using bam file from %s' % bam_file)
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file)

    ## always use quiet mode to avoid problems with storing log output.
    ## An argument list (no shell) keeps paths with spaces intact.
    cli_cuff = ["cufflinks", "-q", "--no-update-check",
                "-F", "%.2f" % min_isoform_frac,
                "-I", "%d" % max_intron_length,
                "--min-intron-length", "%d" % min_intron_length,
                "--library-type", "fr-unstranded",
                "-p", "%d" % num_cpus,
                "-o", result_dir,
                bam_file]
    sys.stdout.write('\trun cufflinks as: %s \n' % " ".join(cli_cuff))

    try:
        os.chdir(result_dir)
        process = subprocess.Popen(cli_cuff)
        returncode = process.wait()
        if returncode != 0:
            raise Exception("Exit status return code = %i" % returncode)
    except Exception as e:
        print('Error running cufflinks.\n%s' % str(e))
def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
    by random sampling

    infiles -- pair (bam file, file containing the read count as an integer)
    outfile -- path of the thinned bam file; it is indexed afterwards
    nreads  -- target number of reads

    Each read is kept with probability nreads / readcount. When the count
    file reports zero reads, every read is kept.
    '''
    infile, countfile = infiles

    fh = IOTools.openFile(countfile, "r")
    try:
        readcount = int(fh.read())
    finally:
        fh.close()

    # A zero count would otherwise raise ZeroDivisionError.
    if readcount:
        threshold = float(nreads) / float(readcount)
    else:
        threshold = 1.0

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    pysam_in = pysam.Samfile(infile, "rb")
    try:
        pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)
        try:
            for read in pysam_in.fetch():
                ninput += 1
                if random.random() <= threshold:
                    pysam_out.write(read)
                    noutput += 1
        finally:
            pysam_out.close()
    finally:
        pysam_in.close()

    pysam.index(outfile)

    percent = 100.0 * noutput / ninput if ninput else 0.0
    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, percent, nreads))
def addReadGroupSet(self, datasetName, filePath, moveMode):
    """
    Add a read group set to the repo.

    The bam file is moved into the dataset's reads directory according to
    moveMode. An index file found next to the source bam is moved along with
    it; otherwise the moved bam file is indexed in place.
    """
    self._check()
    self._checkDataset(datasetName)
    self._checkFile(filePath, self.bamExtension)

    # move the bam file
    fileName = os.path.basename(filePath)
    readGroupSetName = filenameWithoutExtension(fileName, self.bamExtension)
    destPath = os.path.join(
        self._repoPath, self.datasetsDirName, datasetName,
        self.readsDirName, fileName)
    self._assertPathEmpty(destPath, inRepo=True)
    self._moveFile(filePath, destPath, moveMode)

    # move the index file if it exists, otherwise do indexing
    sourceDir = os.path.split(filePath)[0]
    indexPath = os.path.join(
        sourceDir, readGroupSetName + self.bamIndexExtension)
    if os.path.exists(indexPath):
        indexMessage = ""
        destDir = os.path.split(destPath)[0]
        destIndexPath = os.path.join(destDir, os.path.basename(indexPath))
        self._moveFile(indexPath, destIndexPath, moveMode)
    else:
        pysam.index(destPath.encode('utf-8'))
        indexMessage = " (and indexed)"

    # finish
    message = "ReadGroupSet '{}' added to dataset '{}'{}".format(
        fileName, datasetName, indexMessage)
    self._repoEmit(message)
def bwa_sampe(pe1_path, pe2_path, genome_path, output_path):
    """Align a read pair with ``bwa aln`` / ``bwa sampe``.

    The genome is indexed into a scratch directory, both mates are aligned,
    the pairs are combined into a SAM file, and the result is converted to
    BAM, sorted to ``output_path + '.bam'`` and indexed. The scratch
    directory is removed before returning, also on failure.

    Raises subprocess.CalledProcessError when a bwa step exits non-zero.
    """
    import shutil  # local import: only needed for the scratch cleanup

    print('Aligning with bwa aln/sampe')
    start = time()
    work_dir = tempfile.mkdtemp()
    try:
        genome_db = os.path.join(work_dir, "genome")
        pe1_output = os.path.join(work_dir, "pe1.sai")
        pe2_output = os.path.join(work_dir, "pe2.sai")
        bwa_output = os.path.join(work_dir, "output.sam")

        # stderr of every bwa step is discarded; the sink has to be opened
        # for writing and is closed once the steps are done
        with open(os.devnull, "w") as null:
            subprocess.check_call(
                ["bwa", "index", "-p", genome_db, genome_path], stderr=null)
            with open(pe1_output, "w") as pe1_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe1_path],
                    stdout=pe1_file, stderr=null)

            with open(pe2_output, "w") as pe2_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe2_path],
                    stdout=pe2_file, stderr=null)

            with open(bwa_output, "w") as bwa_file:
                subprocess.check_call(
                    ["bwa", "sampe", genome_db, pe1_output, pe2_output,
                     pe1_path, pe2_path],
                    stdout=bwa_file, stderr=null)

        elapsed = time() - start
        # single-argument form prints the same text under Python 2 and 3
        print('Time elapsed for bwa aln/sampe:  %s' % elapsed)

        sam_to_bam(bwa_output, bwa_output + ".bam")
        # legacy pysam.sort signature: (input file, output prefix)
        pysam.sort(bwa_output + ".bam", output_path)
        pysam.index(output_path + '.bam')
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path, args):
    """Map a read pair with ``bwa aln`` / ``bwa sampe`` and store the result.

    The output depends on two attributes of ``args``:

    * ``args.sam`` set: the SAM file is moved to ``output_path + '.sam'``.
    * otherwise the SAM file is converted to BAM; with ``args.sort`` set it
      is sorted to ``output_path + '.bam'`` and indexed, else the unsorted
      BAM is moved to ``output_path + '.bam'``.

    The scratch directory used for the intermediate files is removed before
    returning, also on failure. Raises subprocess.CalledProcessError when a
    bwa step exits non-zero.
    """
    work_dir = tempfile.mkdtemp()
    try:
        genome_db = os.path.join(work_dir, "genome")
        pe1_output = os.path.join(work_dir, "pe1.sai")
        pe2_output = os.path.join(work_dir, "pe2.sai")
        bwa_output = os.path.join(work_dir, "output.sam")

        # stderr of every bwa step is discarded; the sink has to be opened
        # for writing and is closed once the steps are done
        with open(os.devnull, "w") as null:
            subprocess.check_call(
                ["bwa", "index", "-p", genome_db, genome_path], stderr=null)
            with open(pe1_output, "w") as pe1_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe1_path],
                    stdout=pe1_file, stderr=null)

            with open(pe2_output, "w") as pe2_file:
                subprocess.check_call(
                    ["bwa", "aln", genome_db, pe2_path],
                    stdout=pe2_file, stderr=null)

            with open(bwa_output, "w") as bwa_file:
                subprocess.check_call(
                    ["bwa", "sampe", "-r",
                     "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                     genome_db, pe1_output, pe2_output, pe1_path, pe2_path],
                    stdout=bwa_file, stderr=null)

        if args.sam:
            shutil.move(bwa_output, output_path + '.sam')
        else:
            sam_to_bam(bwa_output, bwa_output + ".bam")
            if args.sort:
                # coordinate sort the file
                # legacy pysam.sort signature: (input file, output prefix)
                pysam.sort(bwa_output + ".bam", output_path)
                pysam.index(output_path + '.bam')
            else:
                shutil.move(bwa_output + ".bam", output_path + '.bam')
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)
def convert(self):
    """Copy the first ``self.args.numLines`` records to a new file.

    The input and output formats (SAM or BAM) are taken from
    ``self.inputFileFormat`` and ``self.outputFileFormat``. When the input
    holds fewer records than requested, all of them are copied. A BAM
    output is indexed unless ``self.args.skipIndexing`` is set.

    Raises ValueError for a format that is neither SAM nor BAM.
    """
    from itertools import islice  # local import, stdlib only

    # set flags
    if self.inputFileFormat == AlignmentFileConstants.SAM:
        inputFlags = "r"
    elif self.inputFileFormat == AlignmentFileConstants.BAM:
        inputFlags = "rb"
    else:
        raise ValueError(
            "unsupported input format: {}".format(self.inputFileFormat))
    if self.outputFileFormat == AlignmentFileConstants.SAM:
        outputFlags = "wh"
    elif self.outputFileFormat == AlignmentFileConstants.BAM:
        outputFlags = "wb"
    else:
        raise ValueError(
            "unsupported output format: {}".format(self.outputFileFormat))

    # open files
    inputFile = pysam.AlignmentFile(self.args.inputFile, inputFlags)
    try:
        outputFile = pysam.AlignmentFile(
            self.args.outputFile, outputFlags, header=inputFile.header)
        try:
            outputFilePath = outputFile.filename
            log("Creating alignment file '{}'".format(outputFilePath))

            # write new file; islice stops quietly at end of input and
            # works under both Python 2 and Python 3
            for alignedSegment in islice(inputFile, self.args.numLines):
                outputFile.write(alignedSegment)
        finally:
            outputFile.close()
    finally:
        inputFile.close()

    # create index file
    if (not self.args.skipIndexing and
            self.outputFileFormat == AlignmentFileConstants.BAM):
        indexFilePath = "{}.{}".format(
            outputFilePath, AlignmentFileConstants.BAI.lower())
        log("Creating index file '{}'".format(indexFilePath))
        pysam.index(outputFilePath)
def _generate_empty_bam_file(self, sam_path, bam_path_prefix):
    """Write ``<bam_path_prefix>.bam`` holding only the header of the SAM
    file at sam_path, then index it."""
    bam_path = "%s.bam" % bam_path_prefix
    sam_handle = pysam.Samfile(sam_path, "r")
    bam_handle = pysam.Samfile(bam_path, "wb", header=sam_handle.header)
    # no records are written: the output carries the header only
    bam_handle.close()
    sam_handle.close()
    pysam.index(bam_path)
def build_index(self):
    """Announce the index file name (self.baifn) and index self.bamfn."""
    print("Building index %s" % self.baifn)
    pysam.index(self.bamfn)
def main():
    """Command line entry point: count reads mapped to genes and exons.

    Parses the command line, checks that the annotation and every input
    BAM can be opened, writes the header line with the track names to the
    output file, indexes BAM files that have no ``.bai`` next to them and
    hands over to ``count``.
    """
    parser = argparse.ArgumentParser(
        description='Count reads mapped to genes and exons')
    parser.add_argument('annotation', type=str, help='annotation file')
    parser.add_argument('-b', '--inp', type=str, nargs='+',
                        help='input bam files')
    parser.add_argument('-o', '--out', type=str, help='output file')
    parser.add_argument('-n', '--nproc', type=int, default=1,
                        help='number of processes')
    parser.add_argument('-l', '--names', type=str, nargs='+',
                        help='input track names', required=False, default=[])
    parser.add_argument("-s", '--strand', help='strand', choices=['+', '-'])
    parser.add_argument(
        '-f', '--feature',
        choices=('genes', 'transcripts', 'exons', 'introns', 'exonsnoagg'),
        required=True,
        help='Output one value per gene/transcript/exon/intron')
    parser.add_argument(
        '-i', '--include', choices=('all', 'exons', 'introns'),
        default='exons',
        help='For genes count only reads mapped to exons/introns/both. \
             Has no efect if feature==exon or intron ')
    args = parser.parse_args()

    # -b and -o have no default; fail with a usage message instead of a
    # TypeError further down
    if not args.inp:
        parser.error("at least one input bam file is required (-b/--inp)")
    if args.out is None:
        parser.error("an output file is required (-o/--out)")

    if args.nproc < 1:
        args.nproc = 1

    # inputs without an explicit track name are named after their path
    if len(args.inp) > len(args.names):
        args.names += args.inp[len(args.names):]
    print("Counting samples named: %s" % args.names)

    # try to open before running
    with open(args.annotation, "r"):
        pass

    with open(args.out, "w+") as fileo:
        filesi = []
        for bam, name in zip(args.inp, args.names):
            with open(bam):
                pass
            fileo.write("\t%s" % name)
            if not os.path.exists(bam + ".bai"):
                print("Indexing bam file %s" % bam)
                pysam.index(bam)
            filesi.append((bam, name))
        fileo.write("\n")

        counttype = (args.feature, args.include)
        count(args.annotation, filesi, fileo, args.strand, counttype,
              args.nproc)
def main():
    """Command line entry point: filter a BAM by mapping quality and UMI.

    Reads below ``--mapq_thres`` and reads on chromosomes outside the
    expected set are dropped. For every UMI-barcode, read end and
    chromosome only the first read per bin (as returned by
    ``coord_to_bin``) is kept; later reads in the same bin count as
    duplicates and go to ``--dumpfile`` when one is given. The kept reads
    are written to OUTFILE, sorted into OUTFILE_SORTED, indexed, and
    OUTFILE is removed. Unless ``--quiet`` is set, a summary is printed to
    stdout or to ``--logfile``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Bams need to be filtered by UMI and MAPQ before we do anything...')
    parser.add_argument('infile', metavar='INFILE',
                        help='bam of a single cell')
    parser.add_argument('outfile', metavar='OUTFILE',
                        help='bam filtered by UMI and MAPQ')
    parser.add_argument(
        'outfilesorted', metavar='OUTFILE_SORTED',
        help='bam filtered by UMI and MAPQ, sorted and indexed')
    parser.add_argument('--quiet', '-q', action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile', '-l', metavar='LOGFILE', default=None,
                        help='Write arguments to logfile')
    parser.add_argument(
        '--mapq_thres', '-m', type=int, default=30,
        help='Minimum mapping quality. Default 30. bwa output is 0-60?')
    parser.add_argument(
        '--umi_window', '-w', type=int, default=1000,
        help=
        'When checking UMI, throw out read if things have been read within umi_window basepairs away. Default 1000 to handle paired end?'
    )
    parser.add_argument(
        '--umi_indx', '-i', type=int, default=16,
        help=
        'umi indx to get from header. Assumes header format of semi-colon separated followed by colon separated of 2 terms'
    )
    parser.add_argument('--bc_indx', '-I', type=int, default=13,
                        help='bc indx to get from header')
    parser.add_argument('--dumpfile', '-d', metavar='DUMPBAM', default=None,
                        help='bam output of UMI duplicates')
    parser.add_argument('--no_prefix', '-p', action='store_true',
                        help='If set, then remove chr from chromosome names')
    parser.add_argument('--ignore_truncation', action='store_true',
                        help='If set, then ignore truncation')
    parser.add_argument('--add_prefix', action='store_true',
                        help='Add chr to chromosome names')
    parser.add_argument('--debug', action='store_true', help='Debug')
    parser.add_argument('--genome', '-g', metavar='mm or hs', default='mm',
                        help='Genome is mm or hs to define chromosomes')
    args = parser.parse_args()

    # store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.items()]
    ARG_INPUTS = ' '.join(ARG_INPUTS)

    if not args.no_prefix:
        chromos = [''.join(['chr', str(i + 1)])
                   for i in range(22)] + ['chrX', 'chrY', 'chrM']
    else:
        chromos = [''.join(['', str(i + 1)])
                   for i in range(22)] + ['X', 'Y', 'M']
    chromos_set = set(chromos)
    bad_chromos = set()
    print(chromos)

    umi_dic = {}  # duplicate counter per UMI-barcode and read end
    umi_dic_bin = {}  # UMIs by bins, by chromosome
    badreads = 0
    badreadsumi = 0
    goodcounts = 0
    badchromo = 0
    totalcounts = 0  # stays 0 for an empty input instead of being unbound

    # Opened only on request; None means "do not dump duplicates".
    dumpbam = None

    with pysam.AlignmentFile(
            args.infile, "rb",
            ignore_truncation=args.ignore_truncation) as bamf:
        if args.add_prefix:
            # NOTE(review): assigns to the references attribute of an open
            # AlignmentFile; confirm the installed pysam allows this.
            bamf.references = [''.join(['chr', x]) for x in bamf.references]
        with pysam.AlignmentFile(args.outfile, "wb",
                                 template=bamf) as outbam:
            try:
                if args.dumpfile is not None:
                    dumpbam = pysam.AlignmentFile(args.dumpfile, "wb",
                                                  template=bamf)
                # start=1 so that totalcounts is the number of reads seen
                for totalcounts, read in enumerate(bamf, 1):
                    if read.mapping_quality < args.mapq_thres:
                        # throw out bad reads
                        badreads += 1
                        continue
                    # get UMI-Barcode
                    umibc = get_umibc_longheader(read)
                    if args.debug:
                        print(umibc)
                        input("press any key")
                    chromo = read.reference_name
                    if chromo not in chromos_set:
                        # throw out bad chromo
                        bad_chromos.add(chromo)
                        badchromo += 1
                        continue
                    pos = read.reference_start  # left most pos, 0-based
                    # get bin within 1kb
                    bin_id = coord_to_bin(pos)
                    # R1 or R2, depends on fragment from paired end
                    read_end = get_end(read)
                    if umibc not in umi_dic:
                        # initialize R1 and R2
                        umi_dic[umibc] = {'R1': 0, 'R2': 0}
                        # one set of seen bins per read end and chromosome
                        umi_dic_bin[umibc] = {'R1': {}, 'R2': {}}
                        for c in chromos:
                            for end_key in ['R1', 'R2']:
                                umi_dic_bin[umibc][end_key][c] = set()
                    elif bin_id in umi_dic_bin[umibc][read_end][chromo]:
                        # same UMI-barcode already seen in this bin on the
                        # same read end: duplicate, do not keep it
                        umi_dic[umibc][read_end] += 1
                        badreadsumi += 1
                        if dumpbam is not None:
                            dumpbam.write(read)
                        continue
                    # reads here are unique (within a window) and high
                    # quality, write them to outbam and move on
                    umi_dic_bin[umibc][read_end][chromo].add(bin_id)
                    if args.debug:
                        print("Good read, adding to dictionary")
                    goodcounts += 1
                    outbam.write(read)
            finally:
                if dumpbam is not None:
                    dumpbam.close()

    # sort and index bam
    pysam.sort('-o', args.outfilesorted, args.outfile)
    pysam.index(args.outfilesorted)
    # remove temporarily file
    os.remove(args.outfile)

    # Print arguments supplied by user. Ideally as log file because of the \n
    if not args.quiet:
        original_stdout = sys.stdout
        log_handle = None
        if args.logfile is not None:
            log_handle = open(args.logfile, "w+")
            sys.stdout = log_handle
        try:
            print(datetime.datetime.now().strftime('Code output on %c'))
            print('\n')
            print('Command line inputs:')
            print('\n')
            print(CMD_INPUTS)
            print('\n')
            print('Argparse variables:')
            print('\n')
            print(ARG_INPUTS)
            print('\n')
            # list total counts
            print("Reads from these chromosomes thrown out: %s" % bad_chromos)
            print("Total counts: %s" % totalcounts)
            print('\n')
            print("High quality, unique counts: %s" % goodcounts)
            print('\n')
            print("Bad quality counts: %s" % badreads)
            print('\n')
            # write duplicate UMI counts as a table
            for umibc in umi_dic:
                if umi_dic[umibc]['R1'] > 1 and umi_dic[umibc]['R2'] > 1:
                    print('%s\t%s\t%s\n' %
                          (umibc, umi_dic[umibc]['R1'], umi_dic[umibc]['R2']))
        finally:
            # give stdout back and flush the log to disk
            if log_handle is not None:
                sys.stdout = original_stdout
                log_handle.close()
def vcf_from_fasta(args):
    """Entry point for calling variants by consensus sequence alignment.

    Attributes read from ``args``: ref_fasta, bam, out_prefix, and, when no
    bam is given, regions, consensus, chunk_size, pad and mode.

    Alignments come from ``args.bam`` when it is set; otherwise they are
    produced by ``edlib_chunked_align_fastas`` and also written to
    ``<out_prefix>.bam``, which is indexed at the end. Variants are written
    to ``<out_prefix>.vcf``; aligned intervals and their complement are
    written to ``<out_prefix>_coverage.bed`` and
    ``<out_prefix>_coverage_gaps.bed``.
    """
    logger = medaka.common.get_named_logger('CONS2VCF')

    # Load every reference sequence and its length up front.
    with pysam.FastaFile(args.ref_fasta) as fasta:
        ref_seqs = {name: fasta.fetch(name) for name in fasta.references}
        contig_lengths = dict(zip(fasta.references, fasta.lengths))
        total_bp = sum(fasta.lengths)
        ref_contigs = fasta.references
        h = pysam.AlignmentHeader().from_references(fasta.references,
                                                    fasta.lengths)

    if args.bam is not None:
        # user-provided alignments: nothing is written back as bam
        alns = pysam.AlignmentFile(args.bam)
        out_bam = None
    else:
        out_bam = pysam.AlignmentFile(args.out_prefix + '.bam', 'wb',
                                      header=h)
        if args.regions is not None:
            contigs = [r.ref_name for r in args.regions]
        else:
            contigs = None
        alns = edlib_chunked_align_fastas(args.consensus, args.ref_fasta,
                                          contigs, chunk_size=args.chunk_size,
                                          pad=args.pad, mode=args.mode,
                                          header=h)

    vcf_fp = args.out_prefix + '.vcf'
    # one interval tree per reference name, holding the aligned spans
    trees = collections.defaultdict(intervaltree.IntervalTree)
    t_log = now()
    log_interval = 5  # compared against differences of now() below
    msg = 'Processed {:.2%} of reference.'
    bp_done = collections.Counter()
    header_contigs = [
        '{},length={}'.format(c, contig_lengths[c]) for c in ref_contigs
    ]
    meta_info = [
        medaka.vcf.MetaInfo('FORMAT', 'GT', 1, 'String', 'Medaka genotype.')
    ]
    with medaka.vcf.VCFWriter(vcf_fp, contigs=header_contigs,
                              meta_info=meta_info) as writer:
        for aln in alns:
            # reference_start is 0 based, reference_end points to one past
            # the last aligned residue, i.e. same as bed file
            ref = aln.reference_name
            rstart, rend = aln.reference_start, aln.reference_end
            if trees[ref].overlaps(rstart, rend) and args.bam is not None:
                # We expect edlib alignments to overlap by 1 match so only
                # apply this check for a user-provided bam.
                logger.warning(
                    ('WARNING: alignment {}:{}-{} overlaps another ' +
                     'alignment, which could cause overlapping variants.' +
                     '\nCheck output bam and vcf for details.').format(
                         ref, rstart, rend))
            trees[ref].add(intervaltree.Interval(rstart, rend))
            for v in yield_variants_from_aln(aln, ref_seqs[ref]):
                # variants whose reference allele contains N are skipped
                if 'N' in v.ref:
                    continue
                writer.write_variant(v)
                # progress is logged only once log_interval has elapsed
                if now() - t_log > log_interval:
                    done = bp_done[ref] + v.pos - rstart
                    logger.info(msg.format(done / total_bp))
                    t_log = now()
            bp_done[ref] += rend - rstart
            if out_bam is not None:
                out_bam.write(aln)

    if out_bam is not None:
        out_bam.close()
        pysam.index(out_bam.filename)

    bed_fp = args.out_prefix + '_coverage.bed'
    gap_bed_fp = args.out_prefix + '_coverage_gaps.bed'
    for tree in trees.values():
        # strict=False to merge abutting alignments.
        tree.merge_overlaps(strict=False)
    medaka.common.write_intervaltrees_to_bed(trees, bed_fp)
    gap_trees = medaka.common.complement_intervaltrees(trees, contig_lengths)
    medaka.common.write_intervaltrees_to_bed(gap_trees, gap_bed_fp)

    # loop over contigs for which we have alignments checking for gaps
    for contig in trees:
        if len(gap_trees[contig]):
            logger.info(('WARNING: There are alignment gaps for ref contig' +
                         ' {}, see bed files for details.').format(contig))
    if len(ref_contigs) != len(trees):
        logger.info('WARNING: Some contigs have no alignments, see bed files' +
                    ' for details.')

    # bp_done calculated above does not take account of overlapping alignments
    # hence recalculate here based on merged alignment intervals.
    aligned_bp = sum((i.length() for tree in trees.values() for i in tree))
    msg = 'Alignments spanned {:%} of the reference.'
    logger.info(msg.format(aligned_bp / total_bp))
    msg = 'Check bed files {} and {} for alignment coverage and gaps.'
    logger.info(msg.format(bed_fp, gap_bed_fp))
    logger.info('All done. VCF written to {}.'.format(vcf_fp))
def index_bam(bam_fpath):
    """Build the index of the bam file at bam_fpath with pysam.index."""
    pysam.index(bam_fpath)
def main(args=None):
    """Entry point: write a GC-bias corrected BAM, bedGraph or bigWig file.

    The GC frequency table is read from ``args.GCbiasFrequenciesFile`` and
    stored in the module globals ``F_gc``, ``N_gc``, ``R_gc``; run-wide
    settings are stored in the module global ``global_vars``. The genome is
    cut into chunks, each chunk is handled by ``writeCorrectedSam_wrapper``
    (output name ending in 'bam') or ``writeCorrected_wrapper`` (output
    name ending in 'bg' or 'bw'), and the per-chunk files are concatenated
    into ``args.correctedFile``.
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
        if F_gc[x] > 0 and N_gc[x] > 0 else 1 for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(bit.index.keys()), bam.references)
    chrNameBamToBit = dict((v, k) for k, v in chrNameBitToBam.items())
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))
    for chrom, size in chromSizes:
        for i in range(regionStart, size, chunkSize):
            if chrom not in chrNameBamToBit:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                # every chunk of this chromosome would be skipped the same
                # way; report it once and move on
                break
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))
            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            # list(): len() and indexing are used below
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            try:
                for chunk_name in res:
                    chunk_bam = pysam.Samfile(chunk_name)
                    try:
                        for e in chunk_bam.fetch(until_eof=True):
                            of.write(e)
                    finally:
                        chunk_bam.close()
            finally:
                of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        # concatenate intermediary bedgraph files; both ends are opened in
        # binary mode so the bytes are copied unchanged
        with open(_temp_bg_file_name, 'wb') as _temp_bg_file:
            for tempFileName in res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    # bedgraph file
                    with open(tempFileName, 'rb') as chunk_file:
                        shutil.copyfileobj(chunk_file, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # remove by path (the original passed the closed file object);
            # guarded in case the converter already deleted its input
            if os.path.exists(_temp_bg_file_name):
                os.remove(_temp_bg_file_name)
import sys import re import itertools import numpy as np import matplotlib.pyplot as plt from Bio import SeqIO import pysam import runCmd if sys.version_info[0] < 3: from StringIO import StringIO else: from io import StringIO if not os.path.exists(args.bam + "bai"): pysam.index(args.bam, args.bam + ".bai") bamfile = pysam.AlignmentFile(args.bam, "rb") topLen = pysam.AlignmentFile(args.outbam, "wb", template=bamfile) lengths = [] for read in bamfile.fetch(): lengths.append(read.infer_query_length()) lengths = np.sort(np.array(lengths)) # print a histogram of lengths length = len(lengths) for i in range(1, 11): posStart = (i - 1) * length / 10 posEnd = i * length / 10 - 1 print(
def main(mode, input, output, name, ncores, bowtie2_index, cluster, jobs,
         peaks_file, by_rgid, peak_width, keep_duplicates, max_javamem,
         trash_mito, reference_genome, very_sensitive, clipl, clipr, py_trim,
         keep_temp_files, skip_fastqc, overwrite, bedtools_genome,
         blacklist_file, tss_file, macs2_genome_size, bs_genome, bedtools_path,
         bowtie2_path, java_path, macs2_path, samtools_path, r_path):
    """
    proatac: a toolkit for PROcessing ATAC-seq data. \n
    Caleb Lareau, Buenrostro Lab. \n
    modes = ['bulk', 'check', 'counts', 'indexSplit', 'single', 'summitsToPeaks', 'support']\n
    See http://proatac.readthedocs.io for more details.
    """
    # NOTE(review): the docstring above is kept verbatim; it presumably
    # doubles as the command's help text (the function reports through
    # click.echo) - confirm before rewording it.
    #
    # Flow: 'support', 'summitsToPeaks', 'indexSplit', 'counts' and 'check'
    # each end in sys.exit(); 'single' and 'bulk' build the output folder
    # tree, dump the project object as YAML and run the scatter and gather
    # Snakefiles through os.system.

    __version__ = get_distribution('proatac').version
    script_dir = os.path.dirname(os.path.realpath(__file__))
    click.echo(gettime() + "Starting proatac pipeline v%s" % __version__)

    # Determine which genomes are available
    rawsg = os.popen('ls ' + script_dir +
                     "/anno/bedtools/*.sizes").read().strip().split("\n")
    supported_genomes = [
        x.replace(script_dir + "/anno/bedtools/chrom_", "").replace(
            ".sizes", "") for x in rawsg
    ]

    if (mode == "support"):
        click.echo(gettime() +
                   "List of built-in genomes supported in proatac:")
        click.echo(gettime() + str(supported_genomes))
        sys.exit(
            gettime() +
            'Specify one of these genomes or provide your own files (see documentation).'
        )

    # Take a collection of summits files and return a consensus set of peaks
    if (mode == 'summitsToPeaks'):
        click.echo(gettime() + "Starting inference of peaks from summits.")

        # Need chromosome sizes and blacklist
        bedtoolsGenomeFile, blacklistFile = getBfiles(
            bedtools_genome, blacklist_file, reference_genome, script_dir,
            supported_genomes)

        # Figure out which samples to process; splitting empty `ls` output
        # gives [''], so blank entries are dropped before the check
        bedFiles = [
            f for f in os.popen("ls " + input.rstrip("/") +
                                "/*summits.bed*").read().strip().split("\n")
            if f
        ]
        if not bedFiles:
            sys.exit("No summit *summits.bed* files found; QUITTING")
        else:
            click.echo(gettime() + "Calling peaks from these samples:")
            click.echo(gettime() + str(bedFiles))

        # Verify dependencies
        R = get_software_path('R', r_path)
        check_R_packages(['data.table', 'GenomicRanges', 'tools'], R)

        # Execute software
        make_folder(output)
        summitRcall = " ".join([
            R + "script", script_dir + "/bin/R/summitsToCleanPeaks.R",
            ",".join(bedFiles), str(peak_width), blacklistFile,
            bedtoolsGenomeFile, str(999999999), str(0.01), output, name
        ])
        os.system(summitRcall)
        click.echo(gettime() + "Completed peak inference from summit files.")
        sys.exit()

    # TO DO:
    # Make a mode to handle split-pool data
    if (mode == 'indexSplit'):
        sys.exit("Mode does not actually work yet")

    # Make a counts table from user-supplied peaks and bam files
    if (mode == 'counts'):
        click.echo(
            gettime() +
            "Attempting to assemble counts table from user-specified input.")

        # Verify dependencies
        R = get_software_path('R', r_path)
        check_R_packages(['chromVAR', 'SummarizedExperiment', 'tools'], R)

        # Make sure that there are samples to process / there is a peak file
        bamfiles = [
            f for f in os.popen("ls " + input.rstrip("/") +
                                "/*.bam").read().strip().split("\n") if f
        ]
        if not bamfiles:
            sys.exit(
                "No sample *.bam files found in user-specified input; QUITTING"
            )
        else:
            click.echo(gettime() + "Making a counts table from these samples:")
            click.echo(gettime() + str(bamfiles))

        if (os.path.isfile(peaks_file)):
            click.echo(gettime() + "Found peaks file: " + peaks_file)

        # Execute software
        make_folder(output)
        countsRcall = " ".join([
            R + "script", script_dir + "/bin/R/makeCountsTable.R", input,
            peaks_file, str(by_rgid), output, name
        ])
        os.system(countsRcall)
        # NOTE(review): this message mirrors the summitsToPeaks branch and
        # looks copy-pasted; left unchanged because it is runtime output.
        click.echo(gettime() + "Completed peak inference from summit files.")
        sys.exit()

    # Last minute changes
    if (very_sensitive):
        very_sensitive = "--very-sensitive "
    else:
        very_sensitive = ""

    p = proatacProject(
        script_dir, supported_genomes, mode, input, output, name, ncores,
        bowtie2_index, cluster, jobs, peak_width, keep_duplicates,
        max_javamem, trash_mito, reference_genome, very_sensitive, clipl,
        clipr, py_trim, keep_temp_files, skip_fastqc, overwrite,
        bedtools_genome, blacklist_file, tss_file, macs2_genome_size,
        bs_genome, bedtools_path, bowtie2_path, java_path, macs2_path,
        samtools_path, r_path)

    if (mode == "check"):
        click.echo(gettime() + "Dependencies and user-reported file paths OK")
        click.echo(
            "\nproatac will process the following samples / files with bulk / single specified: \n"
        )
        print("Sample", "Fastq1", "Fastq2")
        for x in range(len(p.samples)):
            print(p.samples[x], p.fastq1[x], p.fastq2[x])
        click.echo(
            "\nIf this table doesn't look right, consider specifying a manually created sample input table (see documentation).\n"
        )
        sys.exit(gettime() + "Successful check complete; QUITTING.")

    # Single or bulk processing
    if (mode == "single" or mode == "bulk"):

        # Potentially submit jobs to cluster
        if (ncores == "detect"):
            ncores = str(available_cpu_count())
        else:
            ncores = str(ncores)

        snakeclust = ""
        njobs = int(jobs)
        if (njobs > 0 and cluster != ""):
            # str(): jobs may arrive as an int
            snakeclust = " --jobs " + str(jobs) + " --cluster '" + cluster + "' "
            click.echo(
                gettime() +
                "Recognized flags to process jobs on a computing cluster.")

        # Make output folders
        of = output
        logs = of + "/logs"
        fin = of + "/final"
        trim = of + "/01_trimmed"
        aligned = of + "/02_aligned_reads"
        processed = of + "/03_processed_reads"
        qc = of + "/04_qc"

        folders = [
            of, logs, fin, trim, aligned, processed, qc,
            of + "/.internal/parseltongue", of + "/.internal/samples",
            logs + "/bowtie2", logs + "/trim", logs + "/macs2",
            of + "/03_processed_reads/temp", fin + "/plots"
        ]
        for folder in folders:
            make_folder(folder)

        make_folder(logs + "/picard")
        make_folder(logs + "/picard/inserts")
        make_folder(logs + "/tss")
        make_folder(logs + "/samples")
        make_folder(of + "/mito")

        if not keep_duplicates:
            make_folder(logs + "/picard/markdups")
        if not skip_fastqc:
            make_folder(logs + "/fastqc")

        if (mode == "bulk"):
            make_folder(of + "/final/bams")
            make_folder(of + "/final/summits")
            make_folder(of + "/04_qc/macs2_each")

        if (mode == "single"):
            make_folder(of + "/03_processed_reads/bams")

        # Create internal README files
        if not os.path.exists(of + "/.internal/README"):
            with open(of + "/.internal/README", 'w') as outfile:
                outfile.write(
                    "This folder creates important (small) intermediate; don't modify it.\n\n"
                )
        if not os.path.exists(of + "/.internal/parseltongue/README"):
            with open(of + "/.internal/parseltongue/README", 'w') as outfile:
                outfile.write(
                    "This folder creates intermediate output to be interpreted by Snakemake; don't modify it.\n\n"
                )
        if not os.path.exists(of + "/.internal/samples/README"):
            with open(of + "/.internal" + "/samples" + "/README",
                      'w') as outfile:
                outfile.write(
                    "This folder creates samples to be interpreted by Snakemake; don't modify it.\n\n"
                )

        # Create promoter file:
        ptss = of + "/.internal/promoter.tss.bed"
        if not os.path.exists(ptss):
            os.system('''awk '{print $1"\t"$2-2000"\t"$3+2000"\t"$4}' ''' +
                      p.tssFile + " > " + ptss)

        # Set up sample bam plain text file
        for i in range(len(p.samples)):
            with open(of + "/.internal/samples/" + p.samples[i] +
                      ".fastqs.txt", 'w') as outfile:
                outfile.write(p.fastq1[i] + "\t" + p.fastq2[i])

        y_s = of + "/.internal/parseltongue/proatac.object.yaml"
        with open(y_s, 'w') as yaml_file:
            yaml.dump(dict(p), yaml_file, default_flow_style=False,
                      Dumper=yaml.RoundTripDumper)

        snakecmd_scatter = 'snakemake' + snakeclust + ' --snakefile ' + script_dir + '/bin/snake/Snakefile.proatac.scatter --cores ' + ncores + ' --config cfp="' + y_s + '" '
        os.system(snakecmd_scatter)

        if (mode == 'single'):

            # Merge into one .bam file:
            finalmergedbam = fin + "/" + p.name + ".merged.bam"
            if not os.path.isfile(finalmergedbam):
                os.system(p.samtools + " merge " + finalmergedbam + " " + of +
                          "/03_processed_reads/bams/*.bam")
                pysam.index(finalmergedbam)

        snakecmd_gather = 'snakemake --snakefile ' + script_dir + '/bin/snake/Snakefile.proatac.gather --cores ' + ncores + ' --config cfp="' + y_s + '" '
        os.system(snakecmd_gather)

    if keep_temp_files:
        click.echo(
            gettime() +
            "Temporary files not deleted since --keep-temp-files was specified."
        )
    else:
        if (mode == "bulk" or mode == "single"):
            byefolder = of

            shutil.rmtree(byefolder + "/.internal")
            shutil.rmtree(byefolder + "/01_trimmed")
            shutil.rmtree(byefolder + "/02_aligned_reads")
            shutil.rmtree(byefolder + "/03_processed_reads")
            shutil.rmtree(byefolder + "/04_qc")

            if (trash_mito):
                shutil.rmtree(byefolder + "/mito")
            click.echo(gettime() + "Intermediate files successfully removed.")

    click.echo(gettime() + "Complete.")
def filter_bam_multihits(filename, max_tags, max_hits, out_dir,
                         read_tagger_method, omit_detail=False):
    """Pre-processing function for cleaning up the input bam file.

    Splits the alignments of `filename` into `unique.sorted.bam` and
    `multi.sorted.bam` inside `out_dir`, adding an 'RT' tag computed by
    `read_tagger_collection` to every kept read. The split is skipped when
    both sorted files already exist. With `max_tags` > 0 both files are
    then collapsed by `filter_bam_maxtags`.

    Args:
        filename: path of the input bam file.
        max_tags: passed to `filter_bam_maxtags`; values <= 0 skip the
            collapsing step. A positive value also switches `omit_detail`
            off.
        max_hits: secondary / multi-hit reads are kept only when their NH
            tag is below this value.
        out_dir: directory that receives all output files.
        read_tagger_method: forwarded as `method` to
            `read_tagger_collection`.
        omit_detail: when true, the sequence and qualities of every kept
            read are overwritten with placeholders.

    Returns:
        None

    Raises:
        Exception: when a secondary alignment carries no NH tag.
    """
    # logging the parameter values
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    msg = 'Params:\n'
    for i in args:
        msg += "%s = %s \n" % (i, values[i])
    logger.info(msg)

    def read_tagger(x):
        return read_tagger_collection(x, method=read_tagger_method)

    logger.info('filtering input bam')

    # unique read bam
    ubam_fn = os.path.join(out_dir, 'unique.bam')
    sorted_ubam_fn = os.path.join(out_dir, 'unique.sorted.bam')
    # multi-read bam
    mbam_fn = os.path.join(out_dir, 'multi.bam')
    sorted_mbam_fn = os.path.join(out_dir, 'multi.sorted.bam')

    # do not omit sequences if to filter max_tags
    if max_tags > 0:
        omit_detail = False

    in_bam = pysam.AlignmentFile(filename, 'rb')
    try:
        # splitting unique and multi- reads
        # and add the read taggers we need
        if not (os.path.isfile(sorted_ubam_fn) and
                os.path.isfile(sorted_mbam_fn)):
            unique_counter = 0
            mread_set = set()
            # The scratch bams are opened only when the split really runs,
            # so no stray unique.bam / multi.bam is left behind otherwise.
            ubam = pysam.AlignmentFile(ubam_fn, 'wb', template=in_bam)
            mbam = pysam.AlignmentFile(mbam_fn, 'wb', template=in_bam)
            try:
                counter = 0
                for read in in_bam:
                    # poor man's progress bar
                    counter += 1
                    if not counter % 10**6:
                        logger.debug('tagged %i alignments' % counter)
                    read_tag = read_tagger(read)
                    ## skip reads with unassigned tagger
                    if read_tag == -1:
                        continue
                    read.tags += [('RT', read_tag)]  ## add the tag

                    ## omit the details in read sequence and quality
                    ## recommended for larger bam because this
                    ## can save some memory/storage for large bams
                    if omit_detail:
                        read.query_sequence = '*'
                        read.query_qualities = '0'
                    if read.is_secondary or (read.has_tag('NH') and
                                             read.opt("NH") > 1):
                        try:
                            if read.opt("NH") < max_hits:
                                mbam.write(read)
                                mread_set.add(read.qname)
                        except KeyError:
                            raise Exception(
                                '%s: missing NH tag when is_secondary=%s' %
                                (read.qname, read.is_secondary))
                    else:
                        ubam.write(read)
                        unique_counter += 1
            finally:
                ubam.close()
                mbam.close()

            # sorting
            pysam.sort('-m', '4G', '-@', '3',
                       '-T', os.path.dirname(sorted_ubam_fn),
                       '-o', sorted_ubam_fn, ubam_fn)
            os.remove(ubam_fn)
            pysam.sort('-m', '4G', '-@', '3',
                       '-T', os.path.dirname(sorted_mbam_fn),
                       '-o', sorted_mbam_fn, mbam_fn)
            os.remove(mbam_fn)
            pysam.index(sorted_ubam_fn)
            pysam.index(sorted_mbam_fn)

            # log the statistics
            multi_counter = len(mread_set)
            kept_total = multi_counter + unique_counter
            # nothing kept at all: report 0% instead of dividing by zero
            multi_pct = (float(multi_counter) / kept_total * 100
                         if kept_total else 0.0)
            logger.info(
                'Unique reads = %s;  ' % unique_counter +
                'Multi reads = %s (%.2f %%)' % (multi_counter, multi_pct))
        else:
            logger.info(
                'found previously sorted tag-bam. checking if need collapsing.')

        # filter redundant tags if turned on
        if max_tags > 0:
            logger.info('collapsing unique')
            filter_bam_maxtags(
                os.path.join(out_dir, 'unique.sorted.collapsed.bam'),
                os.path.join(out_dir, 'unique.sorted.bam'), max_tags)
            logger.info('collapsing multi')
            filter_bam_maxtags(
                os.path.join(out_dir, 'multi.sorted.collapsed.bam'),
                os.path.join(out_dir, 'multi.sorted.bam'), max_tags)
    finally:
        in_bam.close()
    return
def main():
    """Command-line entry point: infer the strandness of an RNA-seq library.

    Parses the options, indexes the input alignment file with pysam, asks
    ``SAM.ParseBAM(...).configure_experiment`` for the protocol and the
    fractions of reads explained by each strand configuration, and writes a
    short text report to ``<output_file>.txt``.

    Exits with status 0 after printing help when the input file or the gene
    model is not given, and when either path does not exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string",
                      dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r", "--refgene", action="store", type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option("-s", "--sample-size", action="store", type="int",
                      dest="sample_size", default=200000,
                      help="Number of reads sampled from SAM/BAM file. default=%default")
    parser.add_option("-q", "--mapq", action="store", type="int",
                      dest="map_qual", default=30,
                      help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default")
    # The original help text was " default=%infer_result": optparse only
    # substitutes the literal token %default, so the default was never shown.
    parser.add_option("-o", "--out", action="store", type="string",
                      dest="output_file", default="infer_result",
                      help="Prefix of the output file; the report is written to <prefix>.txt. default=%default")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        print('\n\n' + __doc__, file=sys.stderr)
        sys.exit(0)
    for f in (options.input_file, options.refgene_bed):
        if not os.path.exists(f):
            print('\n\n' + f + " does NOT exists." + '\n', file=sys.stderr)
            sys.exit(0)
    if options.sample_size < 1000:
        print("Warn: Sample Size too small to give a accurate estimation",
              file=sys.stderr)

    pysam.index(options.input_file)
    obj = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2, other) = obj.configure_experiment(
        refbed=options.refgene_bed,
        sample_size=options.sample_size,
        q_cut=options.map_qual)
    if other < 0:
        other = 0.0

    # protocol -> (title line, forward-strand pattern, reverse-strand pattern)
    layouts = {
        "PairEnd": ("This is Paired End Data\n",
                    "1++,1--,2+-,2-+", "1+-,1-+,2++,2--"),
        "SingleEnd": ("This is Single End Data\n", "++,--", "+-,-+"),
    }

    # "with" closes the report even if one of the writes fails.
    with open(options.output_file + ".txt", "w") as file_object:
        if protocol in layouts:
            title, forward, reverse = layouts[protocol]
            file_object.write(title)
            file_object.write(
                "Fraction of reads failed to determine: %.4f" % other + "\n")
            file_object.write(
                "Fraction of reads explained by \"%s\": %.4f\n" % (forward, sp1))
            file_object.write(
                "Fraction of reads explained by \"%s\": %.4f\n" % (reverse, sp2))
            # A configuration is reported only when it explains more than
            # twice as many reads as the other one.
            if sp1 > 2 * sp2:
                file_object.write(
                    "\nExperiment is likely \"%s\" (HTSeq.count --forward)\n"
                    % forward)
            if sp2 > 2 * sp1:
                file_object.write(
                    "\nExperiment is likely \"%s\" (HTSeq.count --reverse)\n"
                    % reverse)
        else:
            file_object.write("Unknown Data type\n")
def __init__(self,
             input_file: str = None,
             genome_database: str = None,
             output_prefix: str = None,
             ignore_overlap: bool = True,
             text_output: bool = False,
             remove_ccgg: bool = False,
             min_read_depth: int = 10,
             max_read_depth: int = 8000,
             threads: int = 1,
             verbose: bool = True,
             min_base_quality: int = 10,
             min_mapping_quality: int = 10,
             ATCGmap: bool = False,
             cg_only: bool = True,
             ignore_orphans: bool = False,
             bedgraph_output: bool = False):
    """Validate the arguments, open the indexed input BAM and store settings.

    The input file is opened with ``require_index=True``; if that raises
    ``IOError`` an index is generated with ``pysam.index`` and the open is
    retried once.  Options forwarded to methylation calling are collected in
    ``self.call_methylation_kwargs``; the remaining options are stored as
    attributes.

    Raises:
        AssertionError: ``input_file`` is not a str, ``text_output`` is not a
            bool, ``threads`` is not an int, or a truthy ``output_prefix`` is
            not a str.
    """
    assert isinstance(input_file, str), 'Path to input file not valid'
    assert isinstance(text_output, bool), 'Not valid bool'
    assert isinstance(threads, int), 'Threads must be specified with integer'
    if output_prefix:
        assert isinstance(output_prefix, str)

    def _open_indexed_bam():
        # single place that defines how the alignment file is opened
        return pysam.Samfile(input_file, 'rb', require_index=True)

    self.input_file = input_file
    try:
        self.input_bam = _open_indexed_bam()
    except IOError:
        # no usable index next to the BAM: build one and retry
        print('Generating Index File')
        pysam.index(input_file)
        self.input_bam = _open_indexed_bam()

    self.text_output = text_output
    self.output_prefix = output_prefix
    self.threads = threads
    # keyword arguments handed on to the methylation caller
    self.call_methylation_kwargs = {
        'input_file': input_file,
        'genome_database': genome_database,
        'ignore_overlap': ignore_overlap,
        'ignore_orphans': ignore_orphans,
        'remove_ccgg': remove_ccgg,
        'max_read_depth': max_read_depth,
        'min_base_quality': min_base_quality,
        'min_mapping_quality': min_mapping_quality,
        'cg_only': cg_only,
    }
    self.min_read_depth = min_read_depth
    self.ATCGmap = ATCGmap
    self.bedgraph_output = bedgraph_output
    self.methylation_calling = True
    self.contigs = self.get_contigs
    # run-time state, filled in later
    self.completed_contigs = None
    self.return_queue = None
    self.pool = None
    self.verbose = verbose
    # read last: every option attribute above is already set at this point
    self.output_objects = self.get_output_objects
    self.methylation_stats = dict.fromkeys(
        ('CG_meth', 'CG_all', 'CH_meth', 'CH_all'), 0)
# Derive the per-sample working paths from names defined earlier in the script.
temp_bam1 = outputbam.replace(".qc.bam", ".temp1.bam").replace("/temp/ready_bam/", "/temp/temp_bam/")
prefixSM = outdir + "/temp/sparse_matrices/" + sample
outputdepth = outdir + "/qc/depth/" + sample + ".depth.txt"

# 1) Filter bam files
# The helper script is run through the shell and its stdout is redirected
# into temp_bam0.
pycall = " ".join([ python, filtclip_py, inputbam, filtlog, mito_genome, proper_paired, NHmax, NMmax ]) + " > " + temp_bam0
os.system(pycall)

# 2) Sort the filtered bam file
pysam.sort("-o", temp_bam1, temp_bam0)
pysam.index(temp_bam1)

# See if we have UMIs
# Only a two-character, non-empty tag is passed on as BARCODE_TAG.
if (umi_barcode != "" and len(umi_barcode) == 2):
    umi_extra = " BARCODE_TAG=" + umi_barcode
else:
    umi_extra = ""

# 3) (Optional) Remove duplicates
# remove_duplicates is compared as the string "True", not as a boolean.
if (remove_duplicates == "True"):
    mdc_long = picardCall + " I=" + temp_bam1 + " O=" + outputbam + " M=" + rmlog + " REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT QUIET=true VERBOSITY=ERROR USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" + umi_extra
    proc = subprocess.Popen(mdc_long, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
    out, err = proc.communicate()  # Read data from stdout and stderr
def generate_index_if_needed(filepath):
    """Create a ``.bai`` index next to ``filepath`` unless one already exists.

    The index path is the absolute form of ``filepath`` with ``.bai``
    appended; it is passed to ``pysam.index`` as the output name.

    Returns:
        True.
    """
    index_file = os.path.abspath(filepath) + '.bai'
    if not os.path.isfile(index_file):
        # Index file doesn't exist; generate it
        pysam.index(filepath, index_file)
    # NOTE(review): the original indentation was lost; the return is assumed
    # to sit at function level (always True) - confirm against the caller.
    return True
def main():
    """Command-line wrapper that prepares the working directory and runs MuTect2.

    Parses the arguments, symlinks the input BAM to ``input.bam`` in the
    current directory, indexes it with pysam, optionally copies the interval
    list to ``intervals.interval_list``, then builds the command with
    ``build_cmd`` and runs it with ``cmd_caller``.  Log messages go to the
    file given as ``log_file``.
    """
    parser = argparse.ArgumentParser(description='Run MuTect2.')

    ### Required
    parser.add_argument('mutect2', help='Path to MuTect2.')
    parser.add_argument('ref_genome', help='Path to the reference genome.')
    parser.add_argument('input_bam', help='Input BAM to process.')
    parser.add_argument('output_vcf', help='Output VCF.')
    parser.add_argument('log_file', help='Logging goes here.')

    # Optional arguments that take a single value, in the order they appear
    # in the --help output.
    value_options = (
        ### Arguments specifying input resource files.
        ('--intervals',
         'File of genomic coordinate intervals to call variants against of form <chrom:start-stop>.'),
        ('--interval_str',
         'Genomic coordinate intervals to call variants against of form <chrom:start-stop>.'),
        ('--panel_of_normals',
         'Panel of normals to compare tumor calls against.'),
        ('--dbsnp', 'dbSNP file'),
        ('--cosmic', 'VCF file of COSMIC sites'),
        ### Arguments specifying integer parameter values.
        ('--interval_padding',
         'Amount of padding in bp to add to each interval'),
        ('--min_base_quality_score',
         'Minimum base quality required to consider a base for calling'),
        ('--initial_tumor_lod',
         'Initial LOD threshold for calling tumor variant'),
        ('--initial_normal_lod',
         'Initial LOD threshold for calling normal variant'),
        ('--tumor_lod', 'LOD threshold for calling tumor variant'),
        ('--normal_lod', 'LOD threshold for calling normal non-germline'),
        ('--dbsnp_normal_lod',
         'LOD threshold for calling normal non-variant at dbsnp sites'),
        ('--active_region_in',
         'Use this interval list file as the active regions to process'),
        ### Arguments not unique to MuTect2.
        ('--min_pruning', 'Minimum support to not prune paths in the graph'),
        ('--min_dangling_branch_length',
         'Minimum length of a dangling branch to attempt recovery'),
        ## JHL Added 12/03/16
        ('--kmerSize', 'Kmer size to use in the read threading assembler'),
        ('--downsampling_type',
         'Type of read downsampling to employ at a given locus [NONE, ALL_READS, BY_SAMPLE]'),
        ('--max_alternate_alleles',
         'Maximum number of alternate alleles to genotype'),
    )
    for flag, help_text in value_options:
        parser.add_argument(flag, help=help_text)
    parser.add_argument(
        '--dontUseSoftClippedBases', action='store_true',
        help='If specified, we will not analyze soft clipped bases in the reads')

    args = parser.parse_args()
    logging.basicConfig(filename=args.log_file, level=logging.INFO)

    work_bam = "input.bam"
    work_bai = "input.bam.bai"
    logging.info("Symlink " + args.input_bam + " with " + work_bam)
    for stale in (work_bam, work_bai):
        # isfile() follows symlinks and is False for a dangling link left by
        # an earlier run, which made os.symlink fail with "File exists";
        # islink() catches that case as well.
        if os.path.isfile(stale) or os.path.islink(stale):
            os.remove(stale)
    os.symlink(args.input_bam, work_bam)

    logging.info("Indexing " + work_bam)
    pysam.index(work_bam)

    if args.intervals:
        logging.info("Including interval list in working directory.")
        shutil.copyfile(args.intervals, "intervals.interval_list")

    logging.info("Building command in preparation for invocation.")
    cmd = build_cmd(args)
    cmd_caller(cmd)
def tophat_map(gtf, out_dir, prefix, fastq, thread, bw=False, scale=False,
               gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed

    gtf      -- annotation passed to TopHat2 with -G (only if gtf_flag is true)
    out_dir  -- working directory; results go to <out_dir>/tophat and the
                bowtie2 index is expected at <out_dir>/bowtie2_index/<prefix>
    prefix   -- basename of the bowtie2 index
    fastq    -- reads, appended verbatim to the TopHat2 command line
    thread   -- value passed to TopHat2 -p
    bw       -- create a BigWig coverage file from accepted_hits.bam
    scale    -- scale the coverage by 1e9 / mapped reads / read length
    gtf_flag -- pass the gtf to TopHat2 when true

    Exits through sys.exit when TopHat2 or bedGraphToBigWig fails, or when
    scaling is requested for a BAM file without mapped reads.
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + fastq
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    return_code = os.system(tophat_cmd) >> 8
    if return_code:
        sys.exit('Error: cannot map reads with TopHat2!')

    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)

    # create Bigwig file if needed
    if not bw:
        return
    if which('bedGraphToBigWig') is None:
        print('Could not find bedGraphToBigWig, so skip this step!')
        return

    print('Create BigWig file...')
    map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
    # index bam if not exist
    if not os.path.isfile(map_bam_fname + '.bai'):
        pysam.index(map_bam_fname)

    chrom_size_fname = '%s/tophat/chrom.size' % out_dir
    # The context manager closes the BAM handle, which was previously
    # leaked when the name was rebound to a BedTool.
    with pysam.AlignmentFile(map_bam_fname, 'rb') as map_bam:
        # extract chrom size file
        with open(chrom_size_fname, 'w') as chrom_size_f:
            for seq in map_bam.header['SQ']:
                chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
        if scale:  # scale to HPB
            mapped_reads = map_bam.mapped
            first_read = next(iter(map_bam), None)
            read_length = first_read.query_length if first_read is not None else 0
            # An empty / unmapped BAM used to end in NameError or
            # ZeroDivisionError; report it like the other failures instead.
            if not mapped_reads or not read_length:
                sys.exit('Error: no mapped reads found, cannot scale BigWig!')
            s = 1000000000.0 / mapped_reads / read_length
        else:
            s = 1

    map_bed = pybedtools.BedTool(map_bam_fname)
    bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
    with open(bedgraph_fname, 'w') as bedgraph_f:
        for line in map_bed.genome_coverage(bg=True, g=chrom_size_fname,
                                            scale=s, split=True):
            # round the (scaled) coverage to the nearest integer
            value = str(int(float(line[3]) + 0.5))
            bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)

    bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
    return_code = os.system('bedGraphToBigWig %s %s %s' %
                            (bedgraph_fname, chrom_size_fname,
                             bigwig_fname)) >> 8
    if return_code:
        sys.exit('Error: cannot convert bedGraph to BigWig!')
def getsamplecoverage(currsample, sampledata, genelist,geneseqs,maxmismatches = None, minextend = None):
    """Collect per-feature read coverage and base statistics for one sample.

    For every feature in ``genelist`` the reads returned by
    ``getbam(bamfile, feature)`` are filtered (mismatch count, more than 10
    bases of coverage on the feature, same strand, optional ``minextend``
    overhang) and accumulated into ``readcoverage`` objects keyed by feature
    name.  Each kept read is aligned position by position against the
    feature sequence from ``geneseqs`` to record mismatches, skipped
    positions and the read base observed at each position.

    Args:
        currsample: sample name; ``sampledata.getbam`` maps it to a BAM path.
        sampledata: object providing ``getbam(sample)``.
        genelist: features to process (``name``, ``start``, ``end``,
            ``strand`` and ``coverage(read)`` are used).
        geneseqs: mapping of feature name to reference sequence.
        maxmismatches: if given, reads with more mismatches are ignored.
        minextend: if given, a read is kept only when it extends at least
            this far beyond the feature start or the feature end.

    Returns:
        The ``coverageinfo`` object built from all collected tables.

    Exits the process (after printing the error to stderr) when the BAM
    file cannot be indexed or opened.
    """
    currbam = sampledata.getbam(currsample)
    allcoverages = dict()
    # these four tables are never filled here and are handed on empty
    multaminocoverages = dict()
    multaccoverages = dict()
    multtrnacoverages = dict()
    uniquecoverages = dict()
    uniquegenomecoverages = dict()
    multigenomecoverages = dict()
    readmismatches = dict()
    adeninemismatches = dict()
    thyminemismatches = dict()
    cytosinemismatches = dict()
    guanosinemismatches = dict()
    readstarts = dict()
    readends = dict()
    readskips = dict()
    trimreadcoverage = dict()
    trimreadmismatches = dict()
    readcounts = dict()

    try:
        if not os.path.isfile(currbam+".bai"):
            pysam.index(currbam)
        bamfile = pysam.Samfile(currbam, "rb" )
    except IOError as strerror:
        sys.stderr.write("%s\n" % (strerror,))
        sys.exit()

    # every table gets its own, independent readcoverage object per feature
    featuretables = (allcoverages, uniquegenomecoverages, multigenomecoverages,
                     readstarts, readends, readmismatches,
                     adeninemismatches, thyminemismatches, cytosinemismatches,
                     guanosinemismatches, readskips,
                     trimreadcoverage, trimreadmismatches)
    basetables = {"A": adeninemismatches, "T": thyminemismatches,
                  "C": cytosinemismatches, "G": guanosinemismatches}

    for currfeat in genelist:
        trnaname = currfeat.name
        for table in featuretables:
            table[trnaname] = readcoverage(currfeat)
        readcounts[trnaname] = 0

        for currread in getbam(bamfile, currfeat):
            if maxmismatches is not None and currread.getmismatches() > maxmismatches:
                continue
            if not (currfeat.coverage(currread) > 10 and currfeat.strand == currread.strand):
                continue
            # The original tested against an undefined name "trnalist" here,
            # raising NameError whenever minextend was supplied.
            if minextend is not None and not (currread.start + minextend <= currfeat.start or currread.end - minextend >= currfeat.end):
                continue

            readcounts[trnaname] += 1
            allcoverages[trnaname].addread(currread)
            readstarts[trnaname].addread(currread.getfirst(1))
            readends[trnaname].addread(currread.getlast(1))
            if currread.issinglemapped():
                uniquegenomecoverages[trnaname].addread(currread)
            else:
                multigenomecoverages[trnaname].addread(currread)

            currseq = currread.getseq()
            geneseq = geneseqs[trnaname]
            cigar = currread.getcigar()
            reflength = cigarreflength(cigar)

            # genestart: offset of the read inside the feature sequence;
            # overhang: bases of the read lying before the feature
            # (both measured in feature orientation)
            if currfeat.strand == "+":
                genestart = max([0,currread.start - currfeat.start])
                overhang = max([0,currfeat.start - currread.start])
            else:
                genestart = max([0,currfeat.end - currread.end])
                overhang = max([0,currread.end - currfeat.end])
            geneend = genestart + reflength - overhang
            refseq = geneseq[genestart:geneend]
            readstart = overhang
            readend = min([readstart + len(refseq), readstart + reflength])

            # per-reference-position coverage of the read, in feature orientation
            if currfeat.strand == "-":
                readcov = list(cigarrefcoverage(reversed(cigar)))
            else:
                readcov = list(cigarrefcoverage(cigar))
            # read base at every reference position, "-" where the read skips it
            alignseq = "".join(currseq[sum(readcov[0:pos])] if readcov[pos] > 0 else "-" for pos in range(reflength))
            alignseq = alignseq[readstart:readend]

            for currpos in range(len(refseq)):
                if currfeat.strand == "-":
                    currgenomepos = currread.end - currpos
                else:
                    currgenomepos = currread.start + currpos
                currbase = alignseq[currpos]
                refbase = refseq[currpos]
                if refbase in gapchars:
                    continue
                if refbase != currbase:
                    readmismatches[trnaname].addbase(currgenomepos)
                # NOTE(review): the original indentation was lost; the base
                # tallies below are assumed to run for every non-gap position,
                # not only for mismatches - confirm.
                if currbase == "-":
                    readskips[trnaname].addbase(currgenomepos)
                if currbase in basetables:
                    basetables[currbase][trnaname].addbase(currgenomepos)

    return coverageinfo( readcounts, allcoverages,readstarts, readends,multaminocoverages, multaccoverages, multtrnacoverages,uniquecoverages, uniquegenomecoverages,multigenomecoverages, readmismatches,adeninemismatches,thyminemismatches,cytosinemismatches, guanosinemismatches,readskips,trimmismatches = trimreadmismatches, trimcoverage = trimreadcoverage )
def PE_Mod(PE, strand, path_bam, output_path):
    """Rewrite the flags of a paired-end BAM and return the BAM to use.

    For ``PE == 'False'`` the input path is returned untouched.  For
    ``PE == 'True'`` every read whose flag belongs to one of the two handled
    flag groups gets an ``XS`` strand tag (and, for one mate, a normalised
    flag) according to ``strand`` ('first' or 'second'); reads with other
    flags are dropped.  The result is sorted to
    ``<output_path>/<basename>.sorted_PE.bam``, indexed, and the unsorted
    temporary file is removed.

    Args:
        PE: the string 'True' or 'False'.
        strand: 'first' or 'second'; any other value writes no reads.
        path_bam: input BAM path.
        output_path: directory for the temporary and the sorted BAM.

    Returns:
        Path of the BAM file to use downstream.

    Raises:
        ValueError: ``PE`` is neither 'True' nor 'False' (previously this
            ended in an UnboundLocalError at the return statement).
    """
    if PE == 'False':
        return path_bam
    if PE != 'True':
        raise ValueError("PE must be 'True' or 'False', got %r" % (PE,))

    base_name = os.path.basename(path_bam)
    stem = output_path + "/" + base_name.split(".bam")[0]
    out_name = stem + ".sorted_PE.bam"
    temp_name = stem + "_temp.bam"

    # the two groups of SAM flags that are kept
    group_a = frozenset((145, 147, 153, 97, 99, 73))
    group_b = frozenset((161, 163, 81, 83, 89, 137))

    # "with" closes both files even when reading or writing fails
    with pysam.AlignmentFile(path_bam, "rb") as infile:
        # parse the digits of the samtools version banner, e.g. "1.19" -> 119
        sam_strng = subprocess.getstatusoutput('samtools')
        temp = sam_strng[1].split("Version:")[1][:7]
        sam_version = int(''.join(filter(str.isdigit, temp)))

        extra = {} if sam_version == 119 else {"add_sam_header": True}
        with pysam.AlignmentFile(temp_name, "wb", template=infile,
                                 **extra) as outfile:
            for read in infile:
                Flag = read.flag
                if Flag in group_a:
                    in_a = True
                elif Flag in group_b:
                    in_a = False
                else:
                    continue
                if strand == 'second':
                    if Flag < 100:
                        read.flag = 145 if in_a else 161
                    xs = '-' if in_a else '+'
                elif strand == 'first':
                    if Flag > 100:
                        read.flag = 97 if in_a else 81
                    xs = '+' if in_a else '-'
                else:
                    continue
                read.set_tag("XS", xs, replace=True)
                outfile.write(read)

    if sam_version == 119:
        pysam.sort("-o", out_name, temp_name)
    else:
        pysam.sort("-O", "BAM", "-T", stem + "_temp.sorted",
                   "-o", out_name, temp_name)
    pysam.index(out_name)
    os.remove(temp_name)
    return out_name
def main(inputs, output, bam_file, strand_specific, library, protocol, median_fragment_size, stdev_fragment_size, read_length, reference_genome, annotations, masking, aligner_reference, start_time=int(time.time()), **kwargs):
    """
    Run the validation step: gather evidence for every input breakpoint pair,
    assemble and align contigs, call events and write the result files into
    the output directory.

    Args:
        inputs (list): list of input files containing the breakpoint pairs
        output (str): path to the output directory
        bam_file (str): path the bam file
        strand_specific (bool): flag to indicate the input bam is using a strand specific protocol
        library (str): library name, added to every breakpoint pair and used in the igv batch file
        protocol: protocol of the library, compared against PROTOCOL.GENOME / PROTOCOL.TRANS
        median_fragment_size (int): the median fragment size
        stdev_fragment_size (int): the standard deviation in fragment size
        read_length (int): read length
        reference_genome (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genome`
        annotations (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genes`
        masking (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_masking_regions`
        aligner_reference (:class:`~mavis.annotate.file_io.ReferenceFile`): path to the aligner reference file (e.g 2bit file for blat)
        start_time (int): not used in this function; note the default is evaluated once, when the function is defined
        **kwargs: overrides for the validation settings; only keys present in DEFAULTS are used

    Raises:
        NotImplementedError: the configured aligner is not BLAT or BWA_MEM
        ValueError: a breakpoint pair has a protocol other than GENOME or TRANS
    """
    mkdirp(output)
    # check the files exist early to avoid waiting for errors
    if protocol == PROTOCOL.TRANS:
        annotations.load()
    reference_genome.load()
    masking.load()

    # start from DEFAULTS and overlay only the recognised keyword overrides
    validation_settings = {}
    validation_settings.update(DEFAULTS.items())
    validation_settings.update(
        {k: v for k, v in kwargs.items() if k in DEFAULTS})
    validation_settings = MavisNamespace(**validation_settings)

    # all output paths live directly under the output directory
    raw_evidence_bam = os.path.join(output, 'raw_evidence.bam')
    contig_bam = os.path.join(output, 'contigs.bam')
    evidence_bed = os.path.join(output, 'evidence.bed')
    passed_output_file = os.path.join(output, PASS_FILENAME)
    passed_bed_file = os.path.join(output, 'validation-passed.bed')
    failed_output_file = os.path.join(output, 'validation-failed.tab')
    contig_aligner_fa = os.path.join(output, 'contigs.fa')
    if validation_settings.aligner == SUPPORTED_ALIGNER.BLAT:
        contig_aligner_output = os.path.join(output, 'contigs.blat_out.pslx')
        contig_aligner_log = os.path.join(output, 'contigs.blat.log')
    elif validation_settings.aligner == SUPPORTED_ALIGNER.BWA_MEM:
        contig_aligner_output = os.path.join(output, 'contigs.bwa_mem.sam')
        contig_aligner_log = os.path.join(output, 'contigs.bwa_mem.log')
    else:
        raise NotImplementedError('unsupported aligner', validation_settings.aligner)
    igv_batch_file = os.path.join(output, 'igv.batch')
    input_bam_cache = BamCache(bam_file, strand_specific)

    # read the breakpoint pairs; an empty cluster id is replaced by a fresh uuid
    bpps = read_inputs(
        inputs,
        add_default={
            COLUMNS.cluster_id: None,
            COLUMNS.stranded: False
        },
        add={
            COLUMNS.protocol: protocol,
            COLUMNS.library: library
        },
        expand_strand=False,
        expand_orient=True,
        cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x})

    # build one evidence object per breakpoint pair; pairs rejected with a
    # ValueError are dropped with a warning
    evidence_clusters = []
    for bpp in bpps:
        if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME:
            try:
                evidence = GenomeEvidence(
                    bpp.break1, bpp.break2, input_bam_cache, reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn(
                    'Dropping breakpoint pair ({}) as bad input {}'.format(
                        str(bpp), str(err)))
        elif bpp.data[COLUMNS.protocol] == PROTOCOL.TRANS:
            try:
                evidence = TranscriptomeEvidence(
                    annotations.content, bpp.break1, bpp.break2, input_bam_cache, reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn('Dropping ({}) as bad input {}'.format(
                    str(bpp), str(err)))
        else:
            raise ValueError('protocol error', bpp.data[COLUMNS.protocol])

    extended_masks = {}
    for chrom, masks in masking.content.items( ):  # extend masking by read length
        extended_masks[chrom] = []
        for mask in masks:
            extended_masks[chrom].append(
                BioInterval(chrom, mask.start - read_length, mask.end + read_length, name=mask.name))

    # evidence overlapping a mask is moved to the filtered list
    evidence_clusters, filtered_evidence_clusters = filter_on_overlap(
        evidence_clusters, extended_masks)

    # gather evidence and assemble contigs; contig sequences are keyed by an
    # md5-derived name so identical sequences share one entry
    contig_sequences = {}
    for i, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {})'.format(i + 1, len(evidence_clusters)), 'gathered evidence for:', evidence.cluster_id, '' if COLUMNS.tracking_id not in evidence.data else '(tracking_id: {})'.format(evidence.tracking_id), time_stamp=True)
        LOG(evidence, time_stamp=False)
        LOG('possible event type(s):', BreakpointPair.classify(evidence), time_stamp=False)
        LOG('outer window regions: {}:{}-{} {}:{}-{}'.format(
            evidence.break1.chr, evidence.outer_window1[0], evidence.outer_window1[1],
            evidence.break2.chr, evidence.outer_window2[0], evidence.outer_window2[1]), time_stamp=False)
        LOG('inner window regions: {}:{}-{} {}:{}-{}'.format(
            evidence.break1.chr, evidence.inner_window1[0], evidence.inner_window1[1],
            evidence.break2.chr, evidence.inner_window2[0], evidence.inner_window2[1]), time_stamp=False)
        evidence.load_evidence(log=LOG)
        LOG('flanking pairs: {};'.format(len(evidence.flanking_pairs)),
            'split reads: {}, {};'.format(
                *[len(a) for a in evidence.split_reads]),
            'half-mapped reads: {}, {};'.format(
                *[len(a) for a in evidence.half_mapped]),
            'spanning-reads: {};'.format(len(evidence.spanning_reads)),
            'compatible flanking pairs:', len(evidence.compatible_flanking_pairs),
            time_stamp=False)
        evidence.assemble_contig(log=LOG)
        LOG('assembled {} contigs'.format(len(evidence.contigs)), time_stamp=False)
        for contig in evidence.contigs:
            name = 'seq-{}'.format(
                hashlib.md5(contig.seq.encode('utf-8')).hexdigest())
            LOG('>', name, '(size={}; reads={:.0f}; coverage={:.2f})'.format(
                len(contig.seq), contig.remap_score(), contig.remap_coverage()), time_stamp=False)
            LOG(contig.seq[:140], time_stamp=False)
            contig_sequences[name] = contig.seq

    # align all contigs in one aligner run, then attach the alignments to
    # each evidence object
    LOG('will output:', contig_aligner_fa, contig_aligner_output)
    raw_contig_alignments = align_sequences(
        contig_sequences, input_bam_cache,
        reference_genome=reference_genome.content,
        aligner_fa_input_file=contig_aligner_fa,
        aligner_output_file=contig_aligner_output,
        clean_files=validation_settings.clean_aligner_files,
        aligner=kwargs.get('aligner', validation_settings.aligner),
        aligner_reference=aligner_reference.name[0],
        aligner_output_log=contig_aligner_log,
        blat_min_identity=kwargs.get('blat_min_identity', validation_settings.blat_min_identity),
        blat_limit_top_aln=kwargs.get('blat_limit_top_aln', validation_settings.blat_limit_top_aln),
        log=LOG)
    for evidence in evidence_clusters:
        select_contig_alignments(evidence, raw_contig_alignments)
    LOG('alignment complete', time_stamp=True)

    event_calls = []
    total_pass = 0
    write_bed_file(
        evidence_bed, itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in evidence_clusters]))

    # call events; evidence without any call is recorded as filtered together
    # with the reason
    validation_counts = {}
    for index, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {}) calling events for: {} {} (tracking_id: {})'.format(
            index + 1, len(evidence_clusters), evidence.cluster_id, evidence.putative_event_types(), evidence.tracking_id), time_stamp=True)
        LOG('source:', evidence)
        calls = []
        failure_comment = None
        try:
            calls = call_events(evidence)
            event_calls.extend(calls)
        except UserWarning as err:
            LOG('warning: error in calling events', repr(err))
            failure_comment = str(err)
        if not calls:
            # note: the fallback comment is a list, an error comment is a str
            failure_comment = [
                'zero events were called'
            ] if failure_comment is None else failure_comment
            evidence.data[COLUMNS.filter_comment] = failure_comment
            filtered_evidence_clusters.append(evidence)
        else:
            total_pass += 1
        LOG('called {} event(s)'.format(len(calls)), time_stamp=True)
        for call in calls:
            LOG(call)
            if call.call_method == CALL_METHOD.CONTIG:
                LOG('\t{} {} [{}] contig_alignment_score: {}, contig_alignment_mq: {} contig_alignment_rank: {}'
                    .format(call.event_type, call.call_method, call.contig_alignment.query_name,
                            round(call.contig_alignment.score(), 2),
                            tuple(call.contig_alignment.mapping_quality()),
                            tuple(call.contig_alignment.alignment_rank())))
                LOG('\talignment:', call.contig_alignment.alignment_id())
            elif call.contig_alignment:
                LOG(
                    '\t{} {} alignment:'.format(call.event_type, call.call_method),
                    call.contig_alignment.alignment_id())
            else:
                LOG('\t{} {}'.format(call.event_type, call.call_method), time_stamp=False)
            # validation ids are numbered per cluster: <cluster_id>-v1, -v2, ...
            validation_counts[call.cluster_id] = validation_counts.get(
                call.cluster_id, 0) + 1
            call.data[COLUMNS.validation_id] = '{}-v{}'.format(
                call.cluster_id, validation_counts[call.cluster_id])
            LOG('\tremapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]'
                ', flanking pairs: {}{}'.format(
                    0 if not call.contig else len(call.contig.input_reads),
                    len(call.spanning_reads),
                    len(call.break1_split_read_names()),
                    len(call.break1_split_read_names(tgt=True)),
                    len(call.break2_split_read_names()),
                    len(call.break2_split_read_names(tgt=True)),
                    len(call.linking_split_read_names()),
                    len(call.flanking_pairs),
                    '' if not call.has_compatible else '(' + str(len(call.compatible_flanking_pairs)) + ')'))

    # write the output validated clusters (split by type and contig)
    for i, call in enumerate(event_calls):
        b1_homseq = None
        b2_homseq = None
        try:
            b1_homseq, b2_homseq = call.breakpoint_sequence_homology(
                reference_genome.content)
        except AttributeError:
            # homology stays None when it cannot be computed for this call
            pass
        call.data.update({
            COLUMNS.break1_homologous_seq: b1_homseq,
            COLUMNS.break2_homologous_seq: b2_homseq,
        })
    LOG('{} putative calls resulted in {} events with 1 or more event call'.
        format(len(evidence_clusters), total_pass), time_stamp=True)
    output_tabbed_file(event_calls, passed_output_file)
    output_tabbed_file(filtered_evidence_clusters, failed_output_file)
    write_bed_file(
        passed_bed_file, itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in event_calls]))

    if validation_settings.write_evidence_files:
        # contig alignments, with cigars converted for display in igv
        with pysam.AlignmentFile(contig_bam, 'wb', template=input_bam_cache.fh) as fh:
            LOG('writing:', contig_bam, time_stamp=True)
            for evidence in evidence_clusters:
                for contig in evidence.contigs:
                    for aln in contig.alignments:
                        aln.read1.cigar = _cigar.convert_for_igv(
                            aln.read1.cigar)
                        fh.write(aln.read1)
                        if aln.read2:
                            aln.read2.cigar = _cigar.convert_for_igv(
                                aln.read2.cigar)
                            fh.write(aln.read2)

        # write the evidence
        with pysam.AlignmentFile(raw_evidence_bam, 'wb', template=input_bam_cache.fh) as fh:
            LOG('writing:', raw_evidence_bam, time_stamp=True)
            # a set, so a read supporting several clusters is written once
            reads = set()
            for evidence in evidence_clusters:
                reads.update(evidence.supporting_reads())
            for read in reads:
                read.cigar = _cigar.convert_for_igv(read.cigar)
                fh.write(read)

        # now sort the contig bam
        sort = re.sub(r'.bam$', '.sorted.bam', contig_bam)
        LOG('sorting the bam file:', contig_bam, time_stamp=True)
        pysam.sort('-o', sort, contig_bam)
        contig_bam = sort
        LOG('indexing the sorted bam:', contig_bam)
        pysam.index(contig_bam)

        # then sort the evidence bam file
        sort = re.sub(r'.bam$', '.sorted.bam', raw_evidence_bam)
        LOG('sorting the bam file:', raw_evidence_bam, time_stamp=True)
        pysam.sort('-o', sort, raw_evidence_bam)
        raw_evidence_bam = sort
        LOG('indexing the sorted bam:', raw_evidence_bam)
        pysam.index(raw_evidence_bam)

        # write the igv batch file
        with open(igv_batch_file, 'w') as fh:
            LOG('writing:', igv_batch_file, time_stamp=True)
            fh.write('load {} name="{}"\n'.format(passed_bed_file, 'passed events'))
            fh.write('load {} name="{}"\n'.format(contig_bam, 'aligned contigs'))
            fh.write('load {} name="{}"\n'.format(evidence_bed, 'evidence windows'))
            fh.write('load {} name="{}"\n'.format(raw_evidence_bam, 'raw evidence'))
            fh.write('load {} name="{} {} input"\n'.format(
                bam_file, library, protocol))
clustered = False for cluster in clusters: if chr1 == cluster.chr1 and chr2 == cluster.chr2 and abs(pos1 - cluster.pos1) < rlen and abs(pos2 - cluster.pos2) < isize: cluster.pos1 = (pos1 + cluster.pos1)/2 cluster.pos2 = (pos2 + cluster.pos2)/2 cluster.number += 1 clustered = True break if not clustered: cluster = Cluster(chr1, pos1, chr2, pos2, 1) clusters.append(cluster) discordant_bam.close() pysam.sort("discordant.bam", out_dir+"/discordant") pysam.index("discordant.bam") supp_cluster_num = 0 for cluster in clusters: if cluster.number > 2: print >> sys.stderr, "Cluster:", cluster.tostring() supp_cluster_num += 1 #--- output dna supp fusion only ---# #if supp_cluster_num: # print "Fusion:", line.strip(), supp_cluster_num #--- output all fusions ---# fusion.dnasupp = supp_cluster_num print fusion.tostring() #print "Fusion:", line.strip(), supp_cluster_num #print >> sys.stderr, script_dir + "/cluster2.sh " + out_dir + "/discordant.bam header " + out_dir
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto', coveragethresholds=[1,5,10,20,30], onefeature=None, tmpdir=None): global TMP if(tmpdir<>None): if(os.path.isdir(tmpdir) or os.path.islink(tmpdir)): TMP = tmpdir else: print 'ERROR: temporary directory '+tmpdir+' does not exist.' print ' Exiting' sys.exit(1) if(not (os.path.isdir(outdir) or os.path.islink(outdir))): print 'WARNING: '+outdir+' does not exist. Creating directory.' os.mkdir(outdir) if(not (os.path.isdir(outdir+'/data') or os.path.islink(outdir+'/data'))): print 'Creating '+outdir+'/data' os.mkdir(outdir+'/data') if(not (os.path.isdir(outdir+'/img') or os.path.islink(outdir+'/img'))): print 'Creating '+outdir+'/img' os.mkdir(outdir+'/img') sortedbams = [] for bamfilename in bamfilenames: filelink = TMP+'/'+os.path.basename(bamfilename) try: os.symlink(bamfilename, filelink) except OSError: print 'WARNING: when trying to create a symbolic link at the temporary directory pointing to '+bamfilename+', a file named '+filelink+' was already found.' print ' Probably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.' print ' Continue (y/n)?' goahead = raw_input() if(goahead=='n' or goahead=='N'): print 'Exiting...' sys.exit(1) elif(goahead<>'y' and goahead<>'Y'): print 'Unknown choice '+goahead print 'Exiting...' sys.exit(1) if(os.path.dirname(bamfilename)<>os.path.dirname(TMP+'/')): os.remove(filelink) os.symlink(bamfilename, filelink) print 'Indexing...' pysam.index(filelink) print ' Done.' if(not bam_file.bam_file(filelink).issorted()): print 'WARNING: '+bamfilename+' is not sorted' print 'Sorting...' pid = str(time.time()) newsortedbam = TMP+'/'+pid+'.sorted' sortedbams.append(newsortedbam+'.bam') pysam.sort(filelink, newsortedbam) print 'Indexing...' pysam.index(sortedbams[-1]) print ' Done.' 
else: sortedbams.append(filelink) if(saturation and depthlist=='auto'): maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams]) depthlist = numpy.arange(maxdepth/5.0, maxdepth+(maxdepth/5.0)-1, maxdepth/5.0) depthlist = depthlist/1000000.0 legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames] executiongranted = multiprocessing.Semaphore(nthreads) if(extend<>None): bedfilename = TMP+'/'+originalbedfilename.replace('.bed','.'+pid+'.extended.bed') bed_file.bed_file(originalbedfilename).extendnoref(extend,bedfilename) else: bedfilename = originalbedfilename if(onefeature==None or onefeature<>'saturation' or onefeature<>'specificity'): Pcoveragebeds,coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted) if((saturation and onefeature==None) or onefeature=='saturation'): Psaturation,coverage_saturation_status,saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist, legend, outdir+'/data/', executiongranted) else: coverage_saturation_status = None saturationslopes = None if(onefeature==None or onefeature=='specificity'): Ponoff_reads,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget = launch_onoff_reads(sortedbams, bedfilename, legend, outdir+'/data/', executiongranted) for i in range(len(Pcoveragebeds)): Pcoveragebeds[i].join() Pcoveragebeds[i].terminate() if(onefeature==None or onefeature=='specificity'): Poffclusters = launch_offclusters(glob.glob(outdir+'/data/*.bed'), bedfilename, executiongranted) if(onefeature==None or onefeature=='coveragefreq'): Pcoveragedistribution,coveragedistribution_status,meancoverage = launch_coverage_distribution(coveragefiles, outdir+'/data/', legend, executiongranted) if(onefeature==None or onefeature=='percbases'): Pcoveredpositions,coveredpositions_status,coveredbases = launch_covered_positions(coveragefiles, coveragethresholds, outdir+'/data/', legend, executiongranted) if(onefeature==None 
or onefeature=='coveragedistr'): Pcoveragethroughtarget,throughtarget_status,lowcovbases = launch_coverage_through_target(coveragefiles, outdir+'/data/', legend, executiongranted) if(len(coveragefiles)>1 and (onefeature==None or onefeature=='coveragecorr')): Pcoveragecorr,coveragecorr_status,corr = launch_coveragecorr(coveragefiles, outdir+'/data/coveragecorr.png', legend, executiongranted) else: coveragecorr_status = None corr = None if(onefeature==None or onefeature=='coveragestd'): Pcoveragestd,coveragestd_status,coveragestd = launch_coverage_std(coveragefiles, outdir+'/data/', legend, executiongranted) if((reference<>None and onefeature==None) or onefeature=='gcbias'): Pgcbias = [] for i,coveragefile in enumerate(coveragefiles): onePgcbias,gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir+'/data/gcbias'+str(i)+'.png', legend[i], executiongranted) Pgcbias.append(onePgcbias) for onePgcbias in Pgcbias: onePgcbias.join() onePgcbias.terminate() else: gcbias_status = None # LAUNCH BASIC STATS if((saturation and onefeature==None) or onefeature=='saturation'): Psaturation.join() Psaturation.terminate() if(onefeature==None or onefeature=='coveragefreq'): Pcoveragedistribution.join() Pcoveragedistribution.terminate() if(onefeature==None or onefeature=='percbases'): Pcoveredpositions.join() Pcoveredpositions.terminate() if(onefeature==None or onefeature=='coveragedistr'): Pcoveragethroughtarget.join() Pcoveragethroughtarget.terminate() if(len(coveragefiles)>1 and (onefeature==None or onefeature=='coveragecorr')): Pcoveragecorr.join() Pcoveragecorr.terminate() if(onefeature==None or onefeature=='coveragestd'): Pcoveragestd.join() Pcoveragestd.terminate() if(onefeature==None or onefeature=='specificity'): Ponoff_reads.join() Ponoff_reads.terminate() Poffclusters.join() Poffclusters.terminate() # if(onefeature==None or onefeature<>'saturation'): # for coveragefile in coveragefiles: # os.remove(coveragefile) if(onefeature==None): 
generate_report(bamfilenames,sortedbams,originalbedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes, onoff_status, duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage, coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget, reference,nthreads,depthlist, coveragethresholds)
def parseJunctionEntries(bam_dir, multi=False, Species=None, ReferenceDir=None): global bam_file global splicesite_db global IndicatedSpecies global ExonReference IndicatedSpecies = Species ExonReference = ReferenceDir bam_file = bam_dir splicesite_db = {} chromosomes_found = {} start = time.time() try: import collections junction_db = collections.OrderedDict() except Exception: try: import ordereddict junction_db = ordereddict.OrderedDict() except Exception: junction_db = {} original_junction_db = copy.deepcopy(junction_db) bamf = pysam.Samfile(bam_dir, "rb") ### Is there are indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0], entry.cigar) break except Exception: ### Make BAM Index if multi == False: print 'Building BAM index file for', bam_dir bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir) bamf = pysam.Samfile(bam_dir, "rb") chromosome = False barcode_pairs = {} bam_reads = 0 count = 0 jid = 1 prior_jc_start = 0 import Bio from Bio.Seq import Seq l1 = None l2 = None o = open(string.replace(bam_dir, '.bam', '.export2.txt'), "w") spacer = 'TGGT' for entry in bamf.fetch(): #if entry.query_name == 'M03558:141:GW181002:1:2103:13361:6440': if spacer in entry.seq: if entry.seq.index(spacer) == 14: viral_barcode = entry.seq[:48] try: mate = bamf.mate(entry) mate_seq = Seq(mate.seq) cell_barcode = str(mate_seq.reverse_complement())[:16] if (viral_barcode, cell_barcode) not in barcode_pairs: o.write(viral_barcode + '\t' + cell_barcode + '\n') barcode_pairs[viral_barcode, cell_barcode] = [] if 'ATAGCGGGAACATGTGGTCATGGTACTGACGTTGACACGTACGTCATA' == viral_barcode: print entry.query_name, cell_barcode, mate_seq except: pass #print viral_barcode, mate.seq;sys.exit() count += 1 #if count==100: sys.exit() bamf.close() o.close()
def main(mappings_sorted_bam, canonical_chr, mappings_sorted_bai=None):
    """Count the reads per region of a BAM file and upload the counts.

    Parameters
    ----------
    mappings_sorted_bam : dict
        Platform link to the BAM file (key=dnanexus_link, value=<file-id>).
    canonical_chr
        Passed to get_chr() to select the regions that are counted.
    mappings_sorted_bai : dict or None
        Platform link to the BAM index. When None the index is created
        locally with pysam.index().

    Returns
    -------
    dict
        {"counts_txt": <link to the uploaded counts file>}.
    """

    def _ascii_name(name):
        # Pysam requires ASCII not Unicode string.
        return unicodedata.normalize(
            'NFKD', name).encode('ascii', 'ignore').decode('ascii')

    #
    # SECTION: Download inputs
    # --------------------------------------------------------------------------
    # mappings_sorted_bam and mappings_sorted_bai are passed to the main function
    # as parameters for our job. Each is wrapped in a DXFile handler and then
    # fetched with dxpy.download_dxfile.
    #
    # The files are stored locally under their ASCII-normalised names. The BAM
    # used to be saved under its raw name but opened under the ASCII one, so a
    # name containing non-ASCII characters could not be found afterwards.
    #
    # If no index file is supplied a *.bai index is created with pysam.index.
    #

    print(mappings_sorted_bai)
    print(mappings_sorted_bam)

    mappings_sorted_bam = dxpy.DXFile(mappings_sorted_bam)
    ascii_bam_name = _ascii_name(mappings_sorted_bam.name)
    dxpy.download_dxfile(mappings_sorted_bam.get_id(), ascii_bam_name)

    if mappings_sorted_bai is not None:
        mappings_sorted_bai = dxpy.DXFile(mappings_sorted_bai)
        # Normalised the same way as the BAM so the two names keep matching.
        dxpy.download_dxfile(mappings_sorted_bai.get_id(),
                             _ascii_name(mappings_sorted_bai.name))
    else:
        pysam.index(ascii_bam_name)

    #
    # SECTION: Get chromosomes regions
    # --------------------------------------------------------------
    # Generate Pysam Alignmentfile object and obtain the regions to count.

    mappings_obj = pysam.AlignmentFile(ascii_bam_name, "rb")
    try:
        regions = get_chr(mappings_obj, canonical_chr)

        #
        # SECTION: Perform basic pysam count.
        # --------------------------------------------------------------
        # Iterate over regions and sum results of pysam.count().

        total_count = 0
        # NOTE: the prefix is the file name minus its last four characters,
        # i.e. a ".bam" extension is assumed.
        count_filename = "{bam_prefix}_counts.txt".format(
            bam_prefix=ascii_bam_name[:-4])

        with open(count_filename, "w") as f:
            for region in regions:
                temp_count = mappings_obj.count(region=region)
                f.write("{region_name}: {counts}\n".format(
                    region_name=region, counts=temp_count))
                total_count += temp_count

            f.write("Total reads: {sum_counts}".format(sum_counts=total_count))
    finally:
        # The alignment file was previously left open.
        mappings_obj.close()

    #
    # SECTION:Output
    # ----------------------------------------------------------------------------
    # Upload generated count file as counts_txt output specified in the dxapp.json

    counts_txt = dxpy.upload_local_file(count_filename)
    output = {}
    output["counts_txt"] = dxpy.dxlink(counts_txt)

    return output
with pysam.AlignmentFile(options.input, in_mode) as INPUT, pysam.AlignmentFile( options.output, out_mode, header=header) as OUTPUT: header_dict = {} n_header = 0 for header in OUTPUT.header["SQ"]: header_dict[header['SN']] = n_header n_header += 1 # print len(header_dict) for segment in INPUT: segment_output = map_to_genome(segment) if segment_output: OUTPUT.write(segment_output) sys.stderr.write( "[%s]Finished.\n Total: %d\n Lifted: %d\n Unlifted: %d\n\n" % (strftime("%Y-%m-%d %H:%M:%S", time.localtime()), total, lifted, unlifted)) if options.sort == True: sys.stderr.write("[%s]Sorting bam...\n" % strftime("%Y-%m-%d %H:%M:%S", time.localtime())) pysam.sort("-o", options.output.replace(".bam", ".sorted.bam"), options.output) if options.index == True: sys.stderr.write("[%s]Indexing bam...\n" % strftime("%Y-%m-%d %H:%M:%S", time.localtime())) pysam.index(options.output.replace(".bam", ".sorted.bam")) if options.no_del_bam == False: os.remove(options.output)
bam_writers[len_to_chunk[span]].write(rec) add_cigar_span(ref, span) logger.write("Finished iterating through bam file.\n") # Close the bam files logger.write("\nClosing bam reader...\n") bam_reader.close() logger.write("\nClosing the bam writers...\n") for bam_writer in bam_writers.values(): bam_writer.close() # Index the chunk bam files logger.write("\nIndexing the chunk bam files...\n") for chunk in len_chunks: logger.write("Chunk %s\n" % list(chunk)) pysam.index(chunk_to_bam(chunk)) # Write bedgraph coverage file for original bam file bedtool = pybedtools.BedTool(bam_file) cov = bedtool.genome_coverage(bg=True, split=True) cov.saveas("%s.bedgraph" % out_bedgraph_prefix) # Write bedgraph coverage files for each chunk bam file for easier display in IGV logger.write("\nWriting bedgraph coverage files...\n") def chunk_to_bedgraph(chunk): return "%s_%s_%s.bedgraph" % (out_bedgraph_prefix, chunk[0], chunk[1]) for chunk in len_chunks:
def rewrite_bam(ref_mapping, args): """split bam file using pysam""" bam_watson_handle = pysam.AlignmentFile(args.watson) bam_crick_handle = pysam.AlignmentFile(args.crick) out_handles = {} #create new template header = bam_watson_handle.header header['SQ'] = [{ 'LN': ref_mapping['gene']['length'], 'SN': 'gene' }, { 'LN': ref_mapping['non_gene']['length'], 'SN': 'non_gene' }] contig_index = {'gene': 0, 'non_gene': 1} for item in bam_watson_handle.header['RG']: watson_path = os.path.join(args.output_dir, '%s.watson.tmp' % item['SM']) crick_path = os.path.join(args.output_dir, '%s.crick.tmp' % item['SM']) watson_handle = pysam.AlignmentFile(watson_path, "wb", header=header) crick_handle = pysam.AlignmentFile(crick_path, "wb", header=header) out_handles[item['SM']] = { 'watson': watson_handle, 'crick': crick_handle } i = 0 print 'start splitting Watson reads' for read in bam_watson_handle: i += 1 if not i % 1000000: print 'processed %s reads' % i sample = '_'.join(dict(read.tags)['RG'].split('_')[2:]) handle = out_handles[sample]['watson'] #change read parameters depending on contig try: contig_name, contig_pos, contig_len = ref_mapping[ read.reference_name] except KeyError: print '%s not found, continue nevertheless' % read.reference_name continue read.rname = contig_index[contig_name] read.mrnm = contig_index[contig_name] read.pos += contig_pos if read.is_paired: if read.is_proper_pair: read.pnext += contig_pos handle.write(read) for subdict in out_handles.values(): subdict['watson'].close() i = 0 print 'start splitting Crick reads' for read in bam_crick_handle: i += 1 if not i % 1000000: print 'processed %s reads' % i sample = '_'.join(dict(read.tags)['RG'].split('_')[2:]) handle = out_handles[sample]['crick'] # change read parameters depending on contig try: contig_name, contig_pos, contig_len = ref_mapping[ read.reference_name] except KeyError: print '%s not found, continue nevertheless' % read.reference_name continue read.rname = contig_index[contig_name] 
read.mrnm = contig_index[contig_name] read.pos += contig_pos if read.is_paired: if read.is_proper_pair: read.pnext += contig_pos handle.write(read) for subdict in out_handles.values(): subdict['crick'].close() if not os.path.exists(os.path.join(args.output_dir, 'bam')): os.mkdir(os.path.join(args.output_dir, 'bam')) for item in bam_watson_handle.header['RG']: watson_tmp = os.path.join(args.output_dir, '%s.watson.tmp' % item['SM']) watson_tmp2 = os.path.join(args.output_dir, '%s.watson.tmp2' % item['SM']) watson_path = os.path.join(args.output_dir, 'bam', '%s.watson.bam' % item['SM']) crick_tmp = os.path.join(args.output_dir, '%s.crick.tmp' % item['SM']) crick_tmp2 = os.path.join(args.output_dir, '%s.crick.tmp2' % item['SM']) crick_path = os.path.join(args.output_dir, 'bam', '%s.crick.bam' % item['SM']) pysam.sort(watson_tmp, '-o', watson_tmp2) pysam.sort(crick_tmp, '-o', crick_tmp2) os.system('samtools calmd -b %s %s > %s 2>/dev/null' % (watson_tmp2, os.path.join(args.output_dir, 'ref.fa'), watson_path)) os.system( 'samtools calmd -b %s %s > %s 2>/dev/null' % (crick_tmp2, os.path.join(args.output_dir, 'ref.fa'), crick_path)) pysam.index(watson_path) pysam.index(crick_path) os.system('rm %s/*.tmp*' % (args.output_dir))
if ext == '.bam': os.remove(sfile) # close all open files and get sam/bam names for key, ofile in ofiles.items(): fname = ofile.name ofile.close() ofiles[key] = [fname] ofiles[key].append(os.path.splitext(ofiles[key][0])[0] + '.bam') ofiles[key].append(os.path.splitext(ofiles[key][0])[0] + '_sorted.bam') # convert each sam file to bam for key, ofile in ofiles.items(): pysam.view('-Sb', ofile[0], '-o', ofile[1], catch_stdout=False) pysam.sort(ofile[1], '-o', ofile[2], catch_stdout=False) pysam.index(ofile[2], catch_stdout=False) # write track to custom tracks file temp = write_track(key, os.path.basename(ofile[2]), colors_dict, sfile, url) tfile.write(temp + '\n') # remove sam file and unsorted bam file os.remove(ofile[0]) os.remove(ofile[1]) # check if public directory exists if os.path.isdir(pubdir): # copy sorted bam and bai files to public directory for _, ofile in ofiles.items(): # sorted bam
def indexed_bam(bam_file, config):
    """Yield an open reader for bam_file, indexing the file first if needed.

    An index is built with pysam.index() when no '<bam_file>.bai' exists.
    The reader is closed once the generator is resumed or finalised, including
    when the consumer raises, which previously left the file open.

    `config` is accepted but not used by this function.

    NOTE(review): the single-yield shape suggests this is meant to be wrapped
    by contextlib.contextmanager; the decorator is not visible here -- confirm.
    """
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file)
    sam_reader = pysam.Samfile(bam_file, "rb")
    try:
        yield sam_reader
    finally:
        sam_reader.close()
def get_metagene_tag_count(self, bam, bam_path, transDF, file):
    '''
    Extract tags from bam files.

    For every row of transDF the region is divided into bins of width
    ceil((stop - start) / 100). Reads are counted with pysam in 10 bins
    before `start`, 100 bins from `start` onwards and 10 further bins after
    those. Rows on the minus strand are stored reversed.

    :param bam: sample name; written to `file` and used to look up
        self.external_sample_norm_factor
    :param bam_path: path of the BAM file; re-indexed when
        compare_bam_bai_creationtime() returns a true value
    :param transDF: DataFrame with the columns 'chr', 'start', 'stop' and
        'strand' (1/'+' or -1/'-')
    :param file: writable handle that receives '<bam>\\t<mapped reads>\\n'
    :return: (raw counts DataFrame, scaled counts DataFrame), one row per
        input row with a recognised strand
    :raises RuntimeError: when indexing the BAM fails
    '''
    index_bam = compare_bam_bai_creationtime(bam_path)
    if index_bam:
        try:
            print('Reindexing bam as bai is older', bam_path)
            pysam.index(bam_path)
        except Exception:
            # Narrowed from a bare `except` so that KeyboardInterrupt and
            # SystemExit are no longer converted into a RuntimeError.
            raise RuntimeError("Error in Bam indexing", bam_path)

    def count_bins(chrom, lo, hi, width, n_bins, raw, scaled):
        # Count reads in n_bins consecutive windows starting at [lo, hi),
        # append the raw and scaled counts, and return the next window.
        for _ in range(n_bins):
            seqcount = sample_bam.count(chrom, lo, hi)
            raw.append(seqcount)
            # NOTE(review): scaled by 5e6 / total_mapped although the original
            # comment called this "per million" -- confirm the factor.
            scaled.append((seqcount*(5.*10**6)/total_mapped))
            lo = hi
            hi = lo + width
        return lo, hi

    sample_bam = pysam.Samfile(bam_path, "rb")
    try:
        total_mapped = sample_bam.mapped
        file.write(bam+'\t'+str(total_mapped)+'\n')
        distribution_df = pd.DataFrame()
        distribution_df_norm = pd.DataFrame()

        for ind, row in transDF.iterrows():
            strand = row['strand']
            list_sample = []
            list_sample_norm = []
            Chr = str(row['chr'])
            start = row['start']
            stop = row['stop']
            interval = math.ceil((stop-start)/100.0)

            # 10 bins of width `interval` directly before `start`
            hstart = start - (interval*10)
            hstop = hstart + interval
            if start > 0:
                count_bins(Chr, hstart, hstop, interval, 10,
                           list_sample, list_sample_norm)

            # 100 bins from `start` onwards; start/stop advance with the bins
            stop = start + interval
            if start > 0:
                start, stop = count_bins(Chr, start, stop, interval, 100,
                                         list_sample, list_sample_norm)

            # 10 further bins.
            # NOTE(review): `stop` is already one bin beyond the last counted
            # window here, so one bin-width is skipped -- confirm intended.
            tstart = stop
            tstop = tstart + interval
            if start > 0:
                count_bins(Chr, tstart, tstop, interval, 10,
                           list_sample, list_sample_norm)

            # additional normalization based on permutation test
            if bam in self.external_sample_norm_factor.keys():
                factor = self.external_sample_norm_factor.get(bam)
                list_sample_norm = [x*factor for x in list_sample_norm]

            # NOTE(review): DataFrame.append was removed in pandas 2.0; kept
            # so the dtypes of the returned frames stay exactly as before.
            if (strand == 1) or (strand == '+'):
                distribution_df = distribution_df.append(pd.Series(list_sample), ignore_index=True)
                distribution_df_norm = distribution_df_norm.append(pd.Series(list_sample_norm), ignore_index=True)
            elif (strand == -1) or (strand == '-'):
                distribution_df = distribution_df.append(pd.Series(list_sample[::-1]), ignore_index=True)
                distribution_df_norm = distribution_df_norm.append(pd.Series(list_sample_norm[::-1]), ignore_index=True)
            else:
                print('Problem with gene strand information:', row['chr'], '-', row['start'])
    finally:
        # closing bam file, also when counting fails part-way
        sample_bam.close()
    return distribution_df, distribution_df_norm
def main():
    """Extract the reads aligned to a region and package them in a tar.gz.

    Reads the command line, collects the ids of the reads aligned to the
    requested window, writes a region BAM/BAI ("reads.bam") and FASTA
    ("reads.fasta") to the working directory, and stores them together with
    the matching fast5 files, the draft assembly, its .fai index and a log
    file in '<output_prefix>.tar.gz'.

    Exits with status 1 when an input file is missing, empty or unreadable.
    """
    global verbose
    global log

    # --------------------------------------------------------
    # PART 0: Parse input
    # --------------------------------------------------------
    parser = argparse.ArgumentParser(description='Extract and package reads within region')
    parser.add_argument('-v', '--verbose', action="store_true", default=False, required=False, dest="verbose",
                        help="Use for verbose output with info on progress.")
    parser.add_argument('-b', '--bam', action="store", required=True, dest="bam",
                        help="Sorted bam file created by aligning reads to the draft genome (refer to reads.sorted.bam in Nanopolish README).")
    parser.add_argument('-r', '--reads', action="store", dest="fa_filename",
                        help="Fasta, fastq, fasta.gz, or fastq.gz file (refer to reads.fa in Nanopolish README)")
    parser.add_argument('-g', '--genome', action="store", required=True, dest="draft_ga",
                        help="Draft genome assembly (refer to draft.fa in Nanopolish README).")
    parser.add_argument('-w', '--window', action="store", required=True, dest="draft_ga_coords",
                        help="Draft genome assembly coordinates wrapped in quotes ex. \"tig000001:10000-20000\".")
    parser.add_argument('-o', '--output_prefix', action="store", required=False, default="reads_subset", dest="output_prefix",
                        help="Output prefix for tar.gz file and log file.")
    args = parser.parse_args()

    # The reads file is not flagged as required but every later step needs
    # it; without this check a missing -r ended in a TypeError.
    if args.fa_filename is None:
        parser.error("the reads file (-r/--reads) is required")

    # Check to see if user used verbose option
    if args.verbose:
        verbose = True

    # Infer readdb file from fasta/q file
    readdb = args.fa_filename + ".index.readdb"

    custom_print( "===================================================" )
    custom_print( "Extract reads that align to given region" )
    custom_print( "Package all necessary files to reproduce error" )
    custom_print( "===================================================" )

    # --------------------------------------------------------
    # PART 1: Validate input
    # --------------------------------------------------------
    custom_print( "[ Input ]" )
    custom_print( "[+] Extracting from draft genome assembly coords: " + args.draft_ga_coords )
    custom_print( "[+] BAM file (reads.fa aligned to draft.fa): " + args.bam )
    custom_print( "[+] Readdb file: " + readdb )
    custom_print( "[+] Draft genome assembly (draft.fa): " + args.draft_ga )
    custom_print( "[+] FASTA/Q file (reads.fa): " + args.fa_filename )
    custom_print( "[+] Output prefix: " + args.output_prefix )

    custom_print( "[ Input check ]" )
    draft_ga_fai = args.draft_ga + ".fai"
    required_files = [args.bam, readdb, args.fa_filename, args.draft_ga, draft_ga_fai]
    for path in required_files:
        if not os.path.exists(path) or not os.path.getsize(path) > 0 or not os.access(path, os.R_OK):
            print( "Expecting " + path + ". But does not exist, is empty or is not readable." )
            sys.exit(1)

    custom_print( "[ Validated input ] All input files exist, are not-empty, and are readable." )

    # --------------------------------------------------------
    # PART 2: Reassign input argument values
    # --------------------------------------------------------
    # o = old/original, ga = genome assembly, fa = fasta/q file
    # coords = coordinates, op = output
    o_bam = args.bam
    o_readdb = readdb
    o_fa = args.fa_filename
    op = args.output_prefix
    draft_ga_coords = args.draft_ga_coords

    # --------------------------------------------------------
    # PART 3: With user input ref coords, extract all
    #         aligned reads within these coordinates,
    #         store read_ids, and fast5 files.
    # --------------------------------------------------------
    custom_print( "[ Extracting info on reads aligned to region ] \t" + draft_ga_coords )
    # A set keeps the membership tests below constant-time; the ids were
    # previously kept in a list that was scanned for every read and record.
    region_read_ids = set()
    samfile = pysam.AlignmentFile(o_bam, "rb")
    try:
        for read in samfile.fetch(region=draft_ga_coords):
            region_read_ids.add(read.query_name)
    finally:
        samfile.close()
    # number of distinct reads that were aligned to the given region
    region_num_reads = len(region_read_ids)

    # --------------------------------------------------------
    # PART 4: Parse readdb file and find path to fast5 files
    #         associated with each read that aligned to region
    # --------------------------------------------------------
    # readdb file has 2 columns: one indicating read_id and another indicating the fast5 file the read came from
    # each row represents a read
    custom_print( "[ Reading readdb file ]" )
    region_fast5_files = dict()
    with open(o_readdb, "r") as readdb_handle:
        for line in readdb_handle:
            columns = line.split("\t")
            read_id = columns[0]
            if read_id in region_read_ids:
                region_fast5_files[str(read_id)] = columns[1].rstrip()

    # --------------------------------------------------------
    # PART 5: Make a region BAM and BAI file
    # --------------------------------------------------------
    new_bam = "reads.bam"
    custom_print( "[ Writing to a new BAM file ] \t" + new_bam )
    pysam.view("-b", o_bam, draft_ga_coords, "-o", new_bam, catch_stdout=False)

    new_bam_index = new_bam + ".bai"
    custom_print( "[ Writing to a new BAI file ] \t" + new_bam_index )
    pysam.index(new_bam, new_bam_index)

    # --------------------------------------------------------
    # PART 6: With user input ref coords, extract all
    #         aligned reads within these coordinates
    #         and make new FASTA file
    # --------------------------------------------------------
    # detect type of sequences file then handle accordingly
    file_type = detect_fa_filetype(o_fa)
    is_gzipped = ".gz" in file_type
    if is_gzipped:
        fasta_marker, fastq_marker = "fasta.gz", "fastq.gz"
    else:
        fasta_marker, fastq_marker = "fasta", "fastq"
    if fasta_marker in file_type:
        seq_format = "fasta"
    elif fastq_marker in file_type:
        seq_format = "fastq"
    else:
        # Unrecognised type: the output FASTA is created but stays empty.
        seq_format = None
    opener = gzip.open if is_gzipped else open

    new_fa = "reads.fasta"
    custom_print( "[ Writing to a new fasta file ]\t" + new_fa )
    with open(new_fa, "w") as fout:
        with opener(o_fa, "rt") as fin:
            if seq_format is not None:
                for record in SeqIO.parse(fin, seq_format):
                    if record.id in region_read_ids:
                        fout.write(">" + record.id + "\n")
                        fout.write(str(record.seq) + "\n")

    # --------------------------------------------------------
    # PART 7: Let's get to tarring
    # --------------------------------------------------------
    # While tarring, we need to fix the directory structure
    # such that the original path to files are not saved.
    # For each fast5 file we need to extract the basename,
    # and save it in tar such that we save only the basename,
    # and not the whole path from the original source.
    tar_filename = op + ".tar.gz"
    archive = tarfile.open(tar_filename, "w:gz")
    try:
        custom_print( "[ Creating a tar.gz file ] \t" + tar_filename )
        custom_print( "[+] FAST5 files: " + op + "/fast5_files/<FAST5 file(s)>" )
        for f5 in region_fast5_files.values():
            # get basename of fast5 file
            f5_basename = extract_basename(f5)
            archive.add(f5, arcname=op + "/fast5_files/" + f5_basename)

        # --------------------------------------------------------
        # PART 8: Add new files to tar
        #         new fasta, new bam, and new bai with reads
        #         in the region given only
        # --------------------------------------------------------
        an = op + "/" + new_fa
        archive.add(new_fa, arcname=an)
        custom_print( "[+] New FASTA: " + an )

        an_new_bam = op + "/" + new_bam
        archive.add(new_bam, arcname=an_new_bam)
        custom_print( "[+] New BAM: " + an_new_bam )

        an_new_bam_index = op + "/" + new_bam_index
        archive.add(new_bam_index, arcname=an_new_bam_index)
        custom_print( "[+] New BAI: " + an_new_bam_index )

        # --------------------------------------------------------
        # PART 9: Add original draft genome assembly file
        #         and the index file
        # --------------------------------------------------------
        an_draft_ga = op + "/draft.fa"
        archive.add(args.draft_ga, arcname=an_draft_ga)
        custom_print( "[+] Original draft ga: " + an_draft_ga )

        an_draft_ga_fai = op + "/draft.fa.fai"
        # Named explicitly; this used to rely on the validation loop's
        # variable still holding the .fai path.
        archive.add(draft_ga_fai, arcname=an_draft_ga_fai)
        custom_print( "[+] Original draft ga index: " + an_draft_ga_fai )

        # --------------------------------------------------------
        # PART 10: Check the number of reads in all new files
        # --------------------------------------------------------
        custom_print( "[ Output check ] " )
        num_reads_bam = region_num_reads
        # every FASTA record written above occupies exactly two lines
        num_reads_fasta = int(float(file_length(new_fa))/2.0)
        num_fast5_files = len(region_fast5_files)

        custom_print( "[+] Num reads in new BAM: \t" + str(num_reads_bam) )
        custom_print( "[+] Num reads in new FASTA: \t" + str(num_reads_fasta) )
        custom_print( "[+] Num files in fast5_files/: \t" + str(num_fast5_files))

        if not all( v == num_fast5_files for v in (num_reads_bam, num_reads_fasta) ):
            print( "[!] WARNING: The number of reads in the new bam, new fasta, and num of fast5 files tarred are not equal..." )
        else:
            custom_print( "[ Validated output ] Number of reads in the new bam, new fasta, and num of fast5 files tarred are equal!" )

        # --------------------------------------------------------
        # FINAL: Write the collected log and add it to the tar
        # --------------------------------------------------------
        logfile = op + ".log"
        with open(logfile, "w") as lfile:
            for s in log:
                lfile.write(s + "\n")
        an_logfile = op + "/" + logfile
        # These messages are emitted after the log file has been written.
        custom_print( "[ Log file ] " + an_logfile )
        custom_print( "[ Tar file ] " + str(tar_filename) )
        custom_print( "[ Finished ] " )
        archive.add(logfile, arcname=an_logfile)
    finally:
        # Close the archive even when packaging fails part-way.
        archive.close()
def main(argv=None):
    """Script main: apply the selected methods to one or more BAM files.

    Parses command line options in sys.argv, unless *argv* is given.

    Input files are taken from the ``stdin`` option (when it is not the
    process' standard input) and, with ``--inplace``, from the positional
    arguments.  If no file is collected, alignments are read from ``-``
    (stdin) and written to ``-`` (stdout).  For named files the output is
    written to a temporary BAM file; with ``--inplace`` that file replaces
    the original (keeping its access/modification times and mode) and the
    result is re-indexed.

    Parameters
    ----------
    argv : list, optional
        Command line arguments.  Falls back to ``sys.argv`` when empty or
        None.

    Raises
    ------
    ValueError
        If ``--inplace`` is given without files or together with ``-``,
        if mismatch filtering is requested without a reference bam file,
        if ``unstrip`` is requested without a fastq file, or if a fastq
        file contains duplicate read names.
    OSError
        If a fastq file given for unstripping does not exist.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file", "-1", dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file", "-2", dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if not bamfiles:
            raise ValueError(
                "please supply one or more bam-files as command line "
                "arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if not bamfiles:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)
        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            # delete=False: the file has to outlive the handle so that
            # pysam can write to it by name and it can be moved later.
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())

            if pysam_ref is not None:
                pysam_ref.close()
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start; the default
            # never objects.
            def pre_check_f(reads):
                return None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if ("strip-sequence" in options.methods or
                    "strip-quality" in options.methods):
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    # `reads` is empty for a file without alignments;
                    # nothing to object to in that case.
                    if reads and reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads and reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            # no NM tag: treat as mismatching, keep sequence
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" %
                                x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # NOTE: the method name must match the "keep-first-base"
            # choice of --methods, otherwise this branch is never taken.
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    for read in i:
                        # unmapped reads have no alignment end; leave
                        # them untouched.
                        if not read.is_unmapped:
                            if read.is_reverse:
                                read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # the temporary output is of no use any more
                        os.unlink(tmpfile.name)
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        # close on both the filter and the iterator path so that the
        # output is flushed before it is moved or indexed below.
        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()