def bamparse(strain_target_bamfile):
    """Parse a sorted BAM file with pysamstats and record per-position base quality.

    The argument is a single ``(strain, target, bamfile)`` tuple. It is
    unpacked inside the body rather than in the signature, because tuple
    parameters were removed from the language in Python 3 (PEP 3113); callers
    that pass one tuple positionally are unaffected.

    :param strain_target_bamfile: tuple of
        ``strain`` - first-level key used in ``parsedict``;
        ``target`` - reference FASTA, passed to pysamstats as ``fafile`` and
        also used as the second-level key in ``parsedict``;
        ``bamfile`` - alignment file passed to pysamstats as ``alignmentfile``.
    :return: the module-level ``parsedict``, updated in place.
    """
    # NOTE(review): parsedict is assumed to be an auto-vivifying nested mapping
    # (e.g. nested defaultdicts) defined at module level - confirm.
    global parsedict
    strain, target, bamfile = strain_target_bamfile
    # stat_baseq_ext (extended base quality statistics) yields one record per
    # reported reference position.
    for rec in pysamstats.stat_baseq_ext(alignmentfile=bamfile, fafile=target):
        # Keep a position only when matches outnumber mismatches and the total
        # depth is 5 or more.
        if rec['matches'] > rec['mismatches'] and rec['reads_all'] > 4:
            # strain -> target -> contig -> position (as float) -> depth = RMS base quality
            parsedict[strain][target][rec['chrom']][float(rec['pos'])][rec['reads_all']] = rec['rms_baseq']
    # Signal progress once this alignment file has been processed.
    dotter()
    return parsedict
def _bowtie(self, raw, db):
    """Align reads against ``db`` with Bowtie2 and summarise the pileup per reference.

    The reads are aligned, the output is piped through samtools view/sort into
    ``<name>.sorted.bam``, the BAM file is indexed, and pysamstats is then used
    to accumulate statistics for every reference sequence that has records.
    Alignment and indexing are skipped when their output files already exist.

    :param raw: sequence of FASTQ paths. Exactly two paths are treated as a
        pair (``m1``/``m2``); any other count is passed as unpaired (``U``).
    :param db: path to the reference FASTA; its extension-less form is used as
        the Bowtie2 index base name.
    :return: dict mapping each reference name to a dict with the keys
        ``identity`` (number of positions reported, as a float), ``depth``
        (sum of ``reads_all``) and ``quality`` (sum of ``rms_baseq``).
    :raises ValueError: if ``raw`` is empty.
    """

    def release_numbers(text):
        """Return the leading dotted integers of ``text`` as a tuple, e.g. '1.10' -> (1, 10)."""
        numbers = []
        for field in text.split('.'):
            digits = ''
            for char in field:
                if not char.isdigit():
                    break
                digits += char
            if not digits:
                break
            numbers.append(int(digits))
        return tuple(numbers)

    # communicate() waits for samtools to finish and closes the pipe;
    # universal_newlines=True makes the output text rather than bytes.
    output = Popen(['samtools', '--version'], stdout=PIPE, stderr=STDOUT,
                   universal_newlines=True).communicate()[0]
    first_line = output.split('\n')[0].split()
    version = release_numbers(first_line[1]) if len(first_line) > 1 else ()
    # Compare numerically: as plain strings "1.10" sorts before "1.3".
    # An unparseable version is treated as >= 1.3, which is what the previous
    # string comparison did for non-numeric output.
    legacy_sort = bool(version) and version < (1, 3)

    raw = [os.path.abspath(fastq) for fastq in raw]
    if not raw:
        raise ValueError("at least one FASTQ file is required")
    if len(raw) == 2:
        name = lcs(*raw)
        indict = dict(("m" + str(x), fastq) for x, fastq in enumerate(raw, 1))
    else:
        indict = dict(U=",".join(raw))
        name = os.path.splitext(raw[0])[0]

    workingdir = name + "tmp"
    make_path(workingdir)
    name += ".sorted.bam"
    # SAMtools sort v1.3 has different run parameters
    if legacy_sort:
        samsort = SamtoolsSortCommandline(input_bam="-", out_prefix=name)
    else:
        # NOTE(review): the arguments are deliberately swapped relative to the
        # legacy branch; presumably this renders as "samtools sort -o <name> -"
        # for samtools >= 1.3 - confirm against the wrapper class in use.
        samsort = SamtoolsSortCommandline(input_bam=name, o=True, out_prefix="-")
    indict.update(dict(samtools=[SamtoolsViewCommandline(b=True, S=True, input_file="-"), samsort]))

    if not os.path.isfile(name):
        Bowtie2CommandLine(bt2=os.path.splitext(os.path.abspath(db))[0],
                           threads=self.threads,
                           very_sensitive_local=True,
                           a=True,
                           **indict)(cwd=workingdir)
    if not os.path.isfile(name + ".bai"):
        SamtoolsIndexCommandline(input_bam=name)(cwd=workingdir)
    # os.rmdir only removes an empty directory.
    os.rmdir(workingdir)

    genes = dict()
    for rec in pysamstats.stat_baseq_ext(alignmentfile=name, fafile=db):
        # Accumulate per-reference totals over every reported position.
        chrom = rec['chrom']
        if chrom not in genes:
            genes[chrom] = dict(identity=1.0, depth=float(rec['reads_all']), quality=0.0)
        else:
            genes[chrom]['identity'] += 1.0
            genes[chrom]['depth'] += float(rec['reads_all'])
        # Quality is summed for every position, including the first one seen
        # for a reference (which is why it is initialised to 0.0).
        genes[chrom]['quality'] += float(rec['rms_baseq'])
    return genes
def run_pysamstats_baseq(bamFile, refFile, baseq, mapq, sampleName, record):
    """Return extended base-quality statistics for one variant record.

    The BAM file is opened, pysamstats is asked for records on the variant's
    chromosome starting at its position, and the first record yielded is
    annotated and returned with its keys in a fixed column order.

    :param bamFile: path of the alignment file to open with pysam.
    :param refFile: reference FASTA handed to pysamstats.
    :param baseq: minimum base quality (``min_baseq``).
    :param mapq: minimum mapping quality (``min_mapq``).
    :param sampleName: value stored under ``Tumor_Sample_Barcode``.
    :param record: variant record providing ``CHROM``, ``POS`` and ``ALT``
        (only the first ALT allele is used).
    :return: ``collections.OrderedDict`` for the first record, or ``None``
        when pysamstats yields no record for the region.
    """
    keyorder = [
        'Tumor_Sample_Barcode', 'chrom', 'pos', 'ref', 'alt',
        'reads_all', 'reads_fwd', 'reads_rev',
        'reads_pp', 'reads_pp_fwd', 'reads_pp_rev',
        'matches', 'matches_fwd', 'matches_rev',
        'matches_pp', 'matches_pp_fwd', 'matches_pp_rev',
        'mismatches', 'mismatches_fwd', 'mismatches_rev',
        'mismatches_pp', 'mismatches_pp_fwd', 'mismatches_pp_rev',
        'rms_baseq', 'rms_baseq_fwd', 'rms_baseq_rev',
        'rms_baseq_pp', 'rms_baseq_pp_fwd', 'rms_baseq_pp_rev',
        'rms_baseq_matches', 'rms_baseq_matches_fwd', 'rms_baseq_matches_rev',
        'rms_baseq_matches_pp', 'rms_baseq_matches_pp_fwd', 'rms_baseq_matches_pp_rev',
        'rms_baseq_mismatches', 'rms_baseq_mismatches_fwd', 'rms_baseq_mismatches_rev',
        'rms_baseq_mismatches_pp', 'rms_baseq_mismatches_pp_fwd', 'rms_baseq_mismatches_pp_rev',
    ]
    # Constant-time rank lookup; fields missing from keyorder sort last
    # instead of raising.
    rank = dict((key, index) for index, key in enumerate(keyorder))
    unknown_rank = len(keyorder)

    chromosome = record.CHROM
    position = record.POS
    alt = record.ALT[0]

    bam_to_process = pysam.AlignmentFile(bamFile)
    try:
        for rec in pysamstats.stat_baseq_ext(bam_to_process, refFile,
                                             chrom=chromosome,
                                             start=position,
                                             end=None,
                                             min_mapq=mapq,
                                             min_baseq=baseq,
                                             no_del=False,
                                             no_dup=True,
                                             one_based=True,
                                             truncate=True):
            rec['alt'] = alt
            # The reported position is replaced by the variant's own position.
            rec['pos'] = position
            rec['Tumor_Sample_Barcode'] = sampleName
            # Only the first record is needed, so return without walking the
            # rest of the chromosome.
            return collections.OrderedDict(
                sorted(rec.items(), key=lambda item: rank.get(item[0], unknown_rank)))
        return None
    finally:
        # Close the alignment file on every exit path.
        bam_to_process.close()