コード例 #1
0
def genotype_bam(argd):
    """Call genotypes in bam file at the given SNPs. argd is a dictionary of
    arguments

    Reads overlapping the panel SNPs are copied to a temporary BAM created in
    argd['tmpdir'] (delete= False, so it is left on disk), piled up, genotyped
    with `bcftools call` and finally written as a tab-separated table.

    Keys of argd read here: 'bam', 'tmpdir', 'snp_panel', 'min_gq', 'min_dp4'.

    Returns a dictionary with 'table' (path to the tabular output) and 'bam'
    (the input bam path, unchanged).

    Raises Exception if `bcftools call` exits with a non-zero code.

    NOTE(review): 'args', 'snp_panel' and 'bcftools' are not defined inside
    this function; presumably they are module-level names. Confirm, since
    argd['ref'], argd['mapq'] and argd['snp_panel'] may have been intended.
    """
    # Extract reads in panel SNPs
    aln_f= pysam.AlignmentFile(argd['bam'])
    # Raw string for the regex: '\.' in a plain literal is an invalid escape
    panel_bam= tempfile.NamedTemporaryFile(prefix= re.sub(r'\.bam$', '', os.path.basename(argd['bam'])) + '.', suffix= '.bam', delete= False, mode= 'w', dir= argd['tmpdir'])
    bname= os.path.splitext(panel_bam.name)[0]
    aln_out_us= pysam.AlignmentFile(panel_bam, 'wb', header= aln_f.header)

    panel= pysam.VariantFile(argd['snp_panel'])
    prev_reads= []
    for snp in panel:
        reads= list(aln_f.fetch(snp.chrom, snp.start, snp.stop))
        # Skip reads already written for the previous SNP (only the
        # immediately preceding SNP is checked)
        for read in reads:
            if read not in prev_reads:
                aln_out_us.write(read)
        prev_reads= reads
    aln_f.close()
    aln_out_us.close()
    panel.close()
    # Release the Python-level handle of the temporary file as well; the file
    # itself stays on disk because of delete= False
    panel_bam.close()

    # Pileup
    pysam.mpileup('-f', args.ref, '-g', '--min-MQ', str(args.mapq), '-o', bname + '.all.bcf', bname + '.bam', catch_stdout= False)
    bcftools.index(bname + '.all.bcf', catch_stdout= False)

    # Call genotypes
    # We use subprocess instead of pysam because of https://github.com/pysam-developers/pysam/issues/693
    cmd= ['bcftools', 'call', '-T', snp_panel, '-m', '--skip-variants', 'indels', '-O', 'z', '-o', bname + '.calls.vcf.gz', bname + '.all.bcf']
    sp= subprocess.Popen(cmd, stderr= subprocess.PIPE)
    stdout, stderr= sp.communicate()
    stderr= stderr.decode().split('\n')
    stderr= '\n'.join([x for x in stderr if 'assuming all sites are diploid' not in x]) # We ignore this warning
    if stderr != '':
        sys.stderr.write(stderr)

    if sp.returncode != 0:
        raise Exception('\n%s exit code from \n\n%s\n' % (sp.returncode, ' '.join(cmd)))

    # Make tabular format
    with open(bname + '.txt', 'w') as txt:
        txt.write('\t'.join(['chrom', 'pos', 'alt', 'gt', 'qual']) + '\n')
        calls= pysam.VariantFile(bname + '.calls.vcf.gz')
        for x in calls:
            # Drop calls below the quality threshold or with too few reads
            # in the DP4 counts
            if x.qual < argd['min_gq'] or sum(x.info['DP4']) < argd['min_dp4']:
                continue
            if x.alts is None:
                alt= '.'
            else:
                alt= x.alts[0]
            gt= x.samples[0]['GT']
            # Skip records where either allele of the genotype is missing
            if gt[0] is None or gt[1] is None:
                continue
            line= [x.chrom, str(x.pos), alt, str(gt[0]) + "/" + str(gt[1]), str(round(x.qual, 1))]
            txt.write('\t'.join(line) + '\n')
        calls.close()
    return {'table': bname + '.txt', 'bam': argd['bam']}
コード例 #2
0
ファイル: genecount.py プロジェクト: daaaaande/circs_snake
    def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
        """Count linear spliced reads at the start and end of each circle.

        circ_coor: path to a tab-separated file whose first three columns are
                   chromosome, start and end.
        bamfile:   alignment file handed to samtools mpileup.
        ref:       reference fasta handed to mpileup via '-f'.
        header:    when True the first line of circ_coor is skipped.

        Four position files are written to self.tmp_dir (start, start - 1,
        end, end + 1) and one mpileup is run per file. The position files are
        intentionally left in self.tmp_dir afterwards.

        Returns the tuple (startcount, endcount) produced by self.submpileup.
        """
        # process the circ_coordinates to left circ position and right circ position
        with open(circ_coor, 'r') as handle:
            coor = handle.readlines()
        if header:
            coor = coor[1:]

        start_path = self.tmp_dir + 'tmp_start_coor'
        start_1_path = self.tmp_dir + 'tmp_start_coor_1'
        end_path = self.tmp_dir + 'tmp_end_coor'
        end_1_path = self.tmp_dir + 'tmp_end_coor_1'

        # 'with' closes the position files even if a malformed line raises
        with open(start_path, 'w') as start_coor, \
                open(start_1_path, 'w') as start_coor_1, \
                open(end_path, 'w') as end_coor, \
                open(end_1_path, 'w') as end_coor_1:
            for line in coor:
                tmp = line.split('\t')
                start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
                start_coor_1.write(tmp[0] + '\t' + str(int(tmp[1]) - 1) + '\n')
                end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
                end_coor_1.write(tmp[0] + '\t' + str(int(tmp[2]) + 1) + '\n')

        print('Started linear spliced read counting for %s' % bamfile)

        # mpileup get the number of spliced reads at circle start position and (start-1) position.
        # The position lists must be the files written above; the former
        # '*_coor_2' names referred to files that are never created.
        print("\t=> running mpileup 1 for start positions [%s]" % bamfile)
        mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', start_path)

        print("\t=> running mpileup 2 for start positions [%s]" % bamfile)
        mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref, '-l', start_1_path)

        # mpileup get the number of spliced reads at circle end position and (end+1) position.
        print("\t=> running mpileup 1 for end positions [%s]" % bamfile)
        mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', end_path)

        print("\t=> running mpileup 2 for end positions [%s]" % bamfile)
        mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref, '-l', end_1_path)

        # get count
        print("\t=> gathering read counts for start positions [%s]" % bamfile)
        startcount = self.submpileup(self.getreadscount(mpileup_start_1), self.getreadscount(mpileup_start))

        print("\t=> gathering read counts for end positions [%s]" % bamfile)
        endcount = self.submpileup(self.getreadscount(mpileup_end), self.getreadscount(mpileup_end_1), left=False)

        print('Finished linear spliced read counting for %s' % bamfile)

        return startcount, endcount
コード例 #3
0
ファイル: genecount.py プロジェクト: ZWB821/DCC
    def genecount(self, circ_coordinates, bamfile, ref, tid):
        """
        Count reads covering the start and the end position of each circle.

        @circ_coordinates: quoted string, path to a tab-separated file with a
                           header line followed by chr, start, end columns
        @bamfile: quoted string
        @ref: quoted string
        @tid: string appended to the temporary file names

        Two position files are written to self.tmp_dir and are left there.
        Returns the tuple (startcount, endcount) from self.getreadscount.
        """

        # process the circ_coordinates to left circ position and right circ position
        with open(circ_coordinates, 'r') as handle:
            # the first line is a header and is skipped
            coordinates = handle.readlines()[1:]

        start_path = self.tmp_dir + 'tmp_start_coordinates_' + tid
        end_path = self.tmp_dir + 'tmp_end_coordinates_' + tid

        # 'with' closes the position files even if a malformed line raises
        with open(start_path, 'w') as start_coordinates, \
                open(end_path, 'w') as end_coordinates:
            for line in coordinates:
                tmp = line.split('\t')
                start_coordinates.write(tmp[0] + '\t' + tmp[1] + '\n')
                end_coordinates.write(tmp[0] + '\t' + tmp[2] + '\n')

        print('Started linear gene expression counting for %s' % bamfile)

        start = time.time()
        # mpileup get the read counts of the start positions
        print("\t=> running mpileup for start positions [%s]" % bamfile)
        mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', start_path)
        end = time.time() - start
        print("\t=> mpileup for start positions for %s took %d seconds" %
              (bamfile, end))

        start = time.time()
        # mpileup get the read counts of the end positions
        print("\t=> running mpileup for end positions [%s]" % bamfile)
        mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', end_path)
        end = time.time() - start
        print("\t=> mpileup for end positions for %s took %d seconds" %
              (bamfile, end))

        print("\t=> gathering read counts for start positions [%s]" % bamfile)
        startcount = self.getreadscount(mpileup_start, countmapped=True)

        print("\t=> gathering read counts for end positions [%s]" % bamfile)
        endcount = self.getreadscount(mpileup_end, countmapped=True)

        print('Finished linear gene expression counting for %s' % bamfile)

        return startcount, endcount
コード例 #4
0
ファイル: genecount.py プロジェクト: Voineagulab/DCC
    def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
        """Count linear spliced reads at the start and end of each circle.

        circ_coor: path to a tab-separated file whose first three columns are
                   chromosome, start and end.
        bamfile:   alignment file handed to samtools mpileup.
        ref:       reference fasta handed to mpileup via '-f'.
        header:    when True the first line of circ_coor is skipped.

        Four position files (start, start - 1, end, end + 1) are written to
        the current working directory under fixed names and removed again
        before returning, also when an error occurs.

        Returns the tuple (startcount, endcount) produced by self.submpileup.
        """
        # process the circ_coordinates to left circ position and right circ position
        with open(circ_coor, 'r') as handle:
            coor = handle.readlines()
        if header:
            coor = coor[1:]

        tmp_files = ('start_coor', 'start_coor_1', 'end_coor', 'end_coor_1')
        try:
            # 'with' closes the position files even if a malformed line raises
            with open('start_coor', 'w') as start_coor, \
                    open('start_coor_1', 'w') as start_coor_1, \
                    open('end_coor', 'w') as end_coor, \
                    open('end_coor_1', 'w') as end_coor_1:
                for line in coor:
                    tmp = line.split('\t')
                    start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
                    start_coor_1.write(tmp[0] + '\t' + str(int(tmp[1]) - 1) + '\n')
                    end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
                    end_coor_1.write(tmp[0] + '\t' + str(int(tmp[2]) + 1) + '\n')

            print('Started linear spliced reads count for %s' % bamfile)

            # mpileup get the number of spliced reads at circle start position and (start-1) position.
            mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l',
                                          'start_coor')
            mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref, '-l',
                                            'start_coor_1')

            # mpileup get the number of spliced reads at circle end position and (end+1) position.
            mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')
            mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor_1')

            # get count
            startcount = self.submpileup(self.getreadscount(mpileup_start_1),
                                         self.getreadscount(mpileup_start))
            endcount = self.submpileup(self.getreadscount(mpileup_end),
                                       self.getreadscount(mpileup_end_1),
                                       left=False)
        finally:
            # remove tmp files; the existence check keeps a failed run from
            # masking its own error with a "file not found"
            for path in tmp_files:
                if os.path.exists(path):
                    os.remove(path)

        return startcount, endcount
コード例 #5
0
 def findSiteCoverages(self, bamIn, minDepth=40):
     """
     Runs mpileup on each SUN position and finds allele fractions.

     For every position in self.wl a single-base chr1 pileup is taken from
     bamIn. Positions are skipped when the pileup is empty, does not have six
     fields, lacks the ref or alt base, or has fewer than minDepth counted
     bases. Fractions of paralog "N" are inverted and passed to
     self.findNormalizingFactor; every other paralog's fractions are scaled
     by 2.0 / that factor.

     Returns a defaultdict mapping paralog -> list of [pos, scaled fraction].
     """
     validBases = set(["A", "T", "G", "C", "a", "t", "g", "c"])
     nVals = []
     resultDict = defaultdict(list)
     for pos, (para, ref, alt, hg38_pos) in self.wl.iteritems():
         region = "chr1:{0}-{0}".format(pos)
         pileUp = pysam.mpileup("-q", "20", "-r", region, bamIn)
         if len(pileUp) == 0:
             continue
         fields = pileUp[0].split()
         if len(fields) != 6:
             continue
         # tally the read bases case-insensitively, ignoring non-base symbols
         baseCounts = Counter()
         for symbol in fields[4]:
             if symbol in validBases:
                 baseCounts[symbol.upper()] += 1
         if not (ref in baseCounts and alt in baseCounts):
             continue
         depth = sum(baseCounts.itervalues())
         if depth < minDepth:
             continue
         frac = formatRatio(baseCounts[alt], depth)
         if para != "N":
             resultDict[para].append([pos, frac])
         else:
             # invert fraction for Notch2 paralogs
             nVals.append(1 - frac)
     norm = self.findNormalizingFactor(nVals)
     for para, sites in resultDict.iteritems():
         scaled = []
         for sitePos, siteFrac in sites:
             scaled.append([sitePos, 2.0 * siteFrac / norm])
         resultDict[para] = scaled
     return resultDict
コード例 #6
0
 def _mpileup_grab(self):
     """
     Get information on this locus from samtools mpileup.

     Runs mpileup on self.inbam with reference self.ref_gen restricted to the
     region self.ival.
     :return: the mpileup output with trailing newlines removed, split on tabs
     """
     raw_output = pysam.mpileup(self.inbam, "-f", self.ref_gen, "-r",
                                self.ival)
     trimmed = raw_output.rstrip('\n')
     return trimmed.split('\t')
コード例 #7
0
def get_consensus_report(name, sam_path, ref_path, is_circular, coverage_threshold=0, report_out_dir=None, tmp_files_dir=None):
    """Build a consensus report for the alignments in a SAM file.

    The SAM file is passed through split_aligments_in_sam, converted to BAM,
    sorted, indexed and piled up against the reference; the resulting mpileup
    file is handed to process_mpileup.

    Args:
        name: passed through to process_mpileup.
        sam_path: path to the input SAM file.
        ref_path: path to the reference fasta (mpileup '-f').
        is_circular: when true, mpileup is run with '--ff 0' so secondary
            alignments are used as well.
        coverage_threshold: passed through to process_mpileup.
        report_out_dir: passed through to process_mpileup.
        tmp_files_dir: directory for intermediate files. When given, the
            files are kept; when None a temporary directory is created and
            removed again, also if a step fails.

    Returns:
        Whatever process_mpileup returns.
    """
    basename = os.path.basename(sam_path)
    file_name, _ext = os.path.splitext(basename)

    keep_tmp_files = tmp_files_dir is not None
    out_dir = tmp_files_dir if keep_tmp_files else tempfile.mkdtemp()

    try:
        os.makedirs(out_dir, exist_ok=True)
        tmp_sam_path = os.path.join(out_dir, file_name + '_tmp.sam')
        tmp_bam_path = os.path.join(out_dir, file_name + '_tmp.bam')
        bam_path = os.path.join(out_dir, file_name + '.bam')
        mpileup_path = bam_path + '.bam.mpilup'

        logging.info("Split long aligments")
        split_aligments_in_sam(sam_path, tmp_sam_path)

        logging.info("Converting sam to bam")
        pysam.view('-S', tmp_sam_path, '-b', '-o', tmp_bam_path, catch_stdout=False)

        logging.info("Sorting bam file")
        pysam.sort(tmp_bam_path, '-o', bam_path, catch_stdout=False)

        logging.info("Creating bam index")
        pysam.index(bam_path, '-b')

        logging.info("Creating mpileup")

        mpileup_flags = ['-A', '-B', '-Q', '0']
        if is_circular:
            # use secondary aligments as well
            mpileup_flags.extend(['--ff', '0'])

        pysam.mpileup(*mpileup_flags,
                      '-f', ref_path, bam_path,
                      '-o', mpileup_path, catch_stdout=False)

        logging.info("Generating consensus and report")
        report = process_mpileup(name, sam_path, ref_path, mpileup_path, coverage_threshold, report_out_dir)
    finally:
        # Only the directory created here is removed; a caller-supplied
        # tmp_files_dir is always left in place.
        if not keep_tmp_files:
            logging.info("Cleaning tmp files")
            shutil.rmtree(out_dir)

    return report
コード例 #8
0
ファイル: genecount.py プロジェクト: KornfeldLab/DCC
    def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
        """Count linear spliced reads at the start and end of each circle.

        circ_coor: path to a tab-separated file whose first three columns are
                   chromosome, start and end.
        bamfile:   alignment file handed to samtools mpileup.
        ref:       reference fasta handed to mpileup via '-f'.
        header:    when True the first line of circ_coor is skipped.

        Four position files (start, start - 1, end, end + 1) are written to
        the current working directory under fixed names and removed again
        before returning, also when an error occurs.

        Returns the tuple (startcount, endcount) produced by self.submpileup.
        """
        # process the circ_coordinates to left circ position and right circ position
        with open(circ_coor, 'r') as handle:
            coor = handle.readlines()
        if header:
            coor = coor[1:]

        tmp_files = ('start_coor', 'start_coor_1', 'end_coor', 'end_coor_1')
        try:
            # 'with' closes the position files even if a malformed line raises
            with open('start_coor', 'w') as start_coor, \
                    open('start_coor_1', 'w') as start_coor_1, \
                    open('end_coor', 'w') as end_coor, \
                    open('end_coor_1', 'w') as end_coor_1:
                for line in coor:
                    tmp = line.split('\t')
                    start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
                    start_coor_1.write(tmp[0] + '\t' + str(int(tmp[1]) - 1) + '\n')
                    end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
                    end_coor_1.write(tmp[0] + '\t' + str(int(tmp[2]) + 1) + '\n')

            print('Started linear spliced reads count for %s' % bamfile)

            # mpileup get the number of spliced reads at circle start position and (start-1) position.
            mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor')
            mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor_1')

            # mpileup get the number of spliced reads at circle end position and (end+1) position.
            mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')
            mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor_1')

            # get count
            startcount = self.submpileup(self.getreadscount(mpileup_start_1),
                                         self.getreadscount(mpileup_start))
            endcount = self.submpileup(self.getreadscount(mpileup_end),
                                       self.getreadscount(mpileup_end_1),
                                       left=False)
        finally:
            # remove tmp files; the existence check keeps a failed run from
            # masking its own error with a "file not found"
            for path in tmp_files:
                if os.path.exists(path):
                    os.remove(path)

        return startcount, endcount
コード例 #9
0
ファイル: bam.py プロジェクト: wachikuma/CNVpytor
 def pileup(self, chr_name, pos, ref, alt, tmp_file=".cnvpytor"):
     """
     Count reference and alternative allele support at given positions.

     The positions are written to a temporary positions file (tmp_file plus a
     random suffix and the chromosome name), samtools mpileup is run on
     self.filename for that chromosome and positions list (with
     self.reference_filename as reference when it is set), and the temporary
     file is removed again.

     :param chr_name: chromosome name; must be a key of self.len
     :param pos: sequence of positions
     :param ref: sequence of reference alleles, parallel to pos
     :param alt: sequence of alternative alleles, parallel to pos
     :param tmp_file: prefix of the temporary positions file
     :return: (nref, nalt), two lists parallel to pos. nref counts the
              occurrences of the reference allele, "." and "," in the
              upper-cased read-base column, nalt the occurrences of the
              alternative allele. Returns None when chr_name is not in
              self.len.
     """
     if chr_name not in self.len:
         _logger.warning("Can not find chromosome '%s' in file '%s'." % (chr_name, self.filename))
         return
     _logger.debug("Pileup chromosome %s from filename %s" % (chr_name, self.filename))
     # Integer bound: random.randint rejects float arguments on Python 3.12+
     tmp_file += "_" + str(random.randint(0, 10 ** 10)) + "_" + chr_name
     try:
         with open(tmp_file, "w") as f:
             for i in pos:
                 print(chr_name, i, file=f)
         if self.reference_filename:
             mpile = pysam.mpileup("-r", chr_name, "-l", tmp_file, "--reference", self.reference_filename, self.filename)
         else:
             mpile = pysam.mpileup("-r", chr_name, "-l", tmp_file, self.filename)
     finally:
         # Remove the positions file even when mpileup fails
         if os.path.exists(tmp_file):
             os.remove(tmp_file)
     # Map position -> upper-cased read bases (columns 2 and 5 of mpileup)
     pos_seq = {}
     for line in mpile.split("\n"):
         if line != "":
             fields = line.split("\t")
             pos_seq[int(fields[1])] = fields[4].upper()
     nref = [0] * len(pos)
     nalt = [0] * len(pos)
     for ix, position in enumerate(pos):
         if position in pos_seq:
             read_bases = pos_seq[position]
             nref[ix] = read_bases.count(ref[ix]) + read_bases.count(".") + read_bases.count(",")
             nalt[ix] = read_bases.count(alt[ix])
     return nref, nalt
コード例 #10
0
ファイル: consistency.py プロジェクト: ak352/melanomics
def GetCoverages(line, normal, tumor):
    """Print running base counts over a region of the normal sample.

    line:   sequence whose items 1..3 are chromosome, start and end; the
            pileup region begins at start + 1.
    normal: alignment file handed to pysam.mpileup.
    tumor:  not used.

    For each item yielded by pysam.mpileup the fifth tab-separated field is
    upper-cased, passed through RemoveIndels, and its bases are added to a
    cumulative tally that is printed after every item.

    Always returns (0, 0).
    NOTE(review): looks unfinished, since the tally is only printed and never
    returned.
    """
    chrom, start, end = line[1:4]
    counts = {}
    region = "-r %s:%s-%s" % (chrom, str(int(start)+1), end)
    # NOTE(review): assumes pysam.mpileup yields one pileup line per
    # iteration here; confirm against the pysam version in use.
    for p in pysam.mpileup(region, normal):
        seq = p.split("\t")[4].upper()
        bases = RemoveIndels(seq)
        for base in bases:
            counts[base] = counts.get(base, 0) + 1
        print(counts)

    return (0,0)
コード例 #11
0
ファイル: genecount.py プロジェクト: Voineagulab/DCC
    def genecount(self, circ_coor, bamfile, ref):
        """
        Count reads covering the start and the end position of each circle.

        @circ_coor: quoted string, path to a tab-separated file with a header
                    line followed by chr, start, end columns
        @bamfile: quoted string
        @ref: quoted string

        Two position files are written to the current working directory
        under fixed names and removed again before returning, also when an
        error occurs.
        Returns the tuple (startcount, endcount) from self.getreadscount.
        """

        # process the circ_coordinates to left circ position and right circ position
        with open(circ_coor, 'r') as handle:
            # the first line is a header and is skipped
            coor = handle.readlines()[1:]

        try:
            # 'with' closes the position files even if a malformed line raises
            with open('start_coor', 'w') as start_coor, \
                    open('end_coor', 'w') as end_coor:
                for line in coor:
                    tmp = line.split('\t')
                    start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
                    end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')

            print('Started linear gene expression counting for %s' % bamfile)

            # mpileup get the read counts of the start and end positions
            mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l',
                                          'start_coor')
            mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')

            # get count
            startcount = self.getreadscount(mpileup_start, countmapped=True)
            endcount = self.getreadscount(mpileup_end, countmapped=True)
        finally:
            # remove tmp files; the existence check keeps a failed run from
            # masking its own error with a "file not found"
            for path in ('start_coor', 'end_coor'):
                if os.path.exists(path):
                    os.remove(path)

        return startcount, endcount
コード例 #12
0
ファイル: genecount.py プロジェクト: KornfeldLab/DCC
 def genecount(self, circ_coor, bamfile, ref):
     """
     Count reads covering the start and the end position of each circle.

     @circ_coor: quoted string, path to a tab-separated file with a header
                 line followed by chr, start, end columns
     @bamfile: quoted string
     @ref: quoted string

     Two position files are written to the current working directory under
     fixed names and removed again before returning, also when an error
     occurs.
     Returns the tuple (startcount, endcount) from self.getreadscount.
     """

     # process the circ_coordinates to left circ position and right circ position
     with open(circ_coor, 'r') as handle:
         # the first line is a header and is skipped
         coor = handle.readlines()[1:]

     try:
         # 'with' closes the position files even if a malformed line raises
         with open('start_coor', 'w') as start_coor, \
                 open('end_coor', 'w') as end_coor:
             for line in coor:
                 tmp = line.split('\t')
                 start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
                 end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')

         print('Started linear gene expression counting %s' % bamfile)

         # mpileup get the read counts of the start and end positions
         mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor')
         mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')

         # get count
         startcount = self.getreadscount(mpileup_start, countmapped=True)
         endcount = self.getreadscount(mpileup_end, countmapped=True)
     finally:
         # remove tmp files; the existence check keeps a failed run from
         # masking its own error with a "file not found"
         for path in ('start_coor', 'end_coor'):
             if os.path.exists(path):
                 os.remove(path)

     return startcount, endcount
コード例 #13
0
ファイル: calculate_depths.py プロジェクト: rruizcor/CFseq
def find_variant_depths(variants, bamfile, logfh, args):
    """Collect read depths for each variant position from an mpileup.

    Args:
        variants: mapping of position -> variant. When empty (or None) no
            pileup is run and an empty dict is returned.
        bamfile: alignment file handed to samtools mpileup.
        logfh: writable file-like object used for log messages.
        args: object whose `debug` attribute is forwarded to
            get_depths_from_pileline.

    Returns:
        dict mapping each variant position that occurs in the pileup to the
        value returned by get_depths_from_pileline for the first pileup line
        at that position. Positions absent from the pileup are omitted.
    """
    depths = {}
    if not variants: return depths
    logfh.write("Creating pileup from bamfile: {}\n".format(bamfile))
    pysam_pileup = pysam.mpileup("-B", "-d99999", '-Q0',
                                 "-f", RESOURCE['ref_fa'], bamfile)
    # Newer pysam returns the whole output as one string; iterating that
    # would walk characters, so split it into lines (line endings kept).
    if isinstance(pysam_pileup, str):
        pysam_pileup = pysam_pileup.splitlines(True)

    # Single pass over the pileup: remember the first line seen at each
    # variant position instead of rescanning the pileup for every variant.
    first_line_at = {}
    for pileline in pysam_pileup:
        pileinfo = PileLine(pileline)
        if pileinfo.pos in variants and pileinfo.pos not in first_line_at:
            first_line_at[pileinfo.pos] = pileinfo
            if len(first_line_at) == len(variants):
                break  # every variant position has been located

    # Depths are still computed in sorted position order, as before.
    for pos in sorted(variants.keys()):
        if pos in first_line_at:
            depths[pos] = get_depths_from_pileline(variants[pos],
                                                   first_line_at[pos],
                                                   pos, logfh, args.debug)
    return depths
コード例 #14
0
ファイル: samtools_variants.py プロジェクト: ys4/ariba
    def _make_vcf_and_read_depths_files(self):
        """Create the VCF-derived outputs and the indexed read depths file.

        Indexes self.ref_fa with faidx when no '.fai' file exists, writes the
        mpileup output to a temporary '<vcf_file>.tmp', runs
        vcfcall_ariba.vcfcall_ariba on it, then compresses and tabix-indexes
        '<outprefix>.read_depths' into self.read_depths_file. The
        uncompressed read depths file and the temporary VCF are deleted at
        the end.

        Raises Error when vcfcall_ariba returns a non-zero value.
        """
        fai_path = self.ref_fa + '.fai'
        if not os.path.exists(fai_path):
            pysam.faidx(self.ref_fa)

        tmp_vcf = self.vcf_file + '.tmp'
        with open(tmp_vcf, 'w') as f:
            mpileup_args = [
                '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
                '-L', '99999999',
                '-A',
                '-f', self.ref_fa,
                '-u',
                '-v',
                self.bam,
            ]
            print(pysam.mpileup(*mpileup_args), end='', file=f)

        status = vcfcall_ariba.vcfcall_ariba(
            tmp_vcf,
            self.outprefix,
            self.min_var_read_depth,
            self.min_second_var_read_depth,
            self.max_allele_freq,
        )
        if status != 0:
            raise Error('Error parsing vcf file. Cannot contine')

        raw_depths = self.outprefix + '.read_depths'
        pysam.tabix_compress(raw_depths, self.read_depths_file)
        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1,
                          end_col=1)
        os.unlink(raw_depths)
        os.unlink(tmp_vcf)
コード例 #15
0
ファイル: sunModel.py プロジェクト: ifiddes/notch2nl_CNV
 def find_site_coverages(self, bam_in, min_depth=20):
     """
     Runs mpileup on each SUN position and finds allele fractions.

     For every position in self.whitelist a single-base chr1 pileup is taken
     from bam_in. Positions are skipped when the pileup is empty, does not
     have six fields, lacks the ref or alt base, or has fewer than min_depth
     counted bases. Fractions of paralog "N" are inverted and averaged into
     a normalizing factor; every other paralog's fractions are scaled by
     2.0 / that factor.

     Returns a defaultdict mapping paralog -> list of [hg38_pos, scaled
     fraction]. Raises ZeroDivisionError when no "N" position passes the
     filters, because the mean is then taken over an empty list.
     """
     valid_bases = {"A", "T", "G", "C", "a", "t", "g", "c"}
     n_vals = []
     sun_results = defaultdict(list)
     for pos, (para, ref, alt, hg38_pos) in self.whitelist.iteritems():
         region = "chr1:{0}-{0}".format(pos)
         pile_up = pysam.mpileup("-q", "20", "-Q", "20", "-r", region,
                                 bam_in)
         if len(pile_up) == 0:
             continue
         fields = pile_up[0].split()
         if len(fields) != 6:
             continue
         # tally the read bases case-insensitively, ignoring non-base symbols
         base_counts = Counter()
         for symbol in fields[4]:
             if symbol in valid_bases:
                 base_counts[symbol.upper()] += 1
         if not (ref in base_counts and alt in base_counts):
             continue
         depth = sum(base_counts.itervalues())
         if depth < min_depth:
             continue
         frac = format_ratio(base_counts[alt], depth)
         if para != "N":
             sun_results[para].append([hg38_pos, frac])
         else:
             # invert fraction for Notch2 paralogs
             n_vals.append(1 - frac)
     # calculate normalizing factor from mean value of Notch2 SUNs, then normalize other SUNs
     normalizing_factor = 1.0 * sum(n_vals) / len(n_vals)
     for para, sites in sun_results.iteritems():
         scaled = []
         for site_pos, site_frac in sites:
             scaled.append([site_pos, 2.0 * site_frac / normalizing_factor])
         sun_results[para] = scaled
     return sun_results
コード例 #16
0
def _zero_mrna_coverage(length):
    """Return a coverage dict mapping every index in range(length) to 0."""
    return {i: 0 for i in range(length)}


def _run_mrna_mpileup(region, bam_path):
    """Run `mpileup -a -r region` on bam_path and return its output lines."""
    return pysam.mpileup("-a", "-r", region, bam_path).split("\n")


def _coverage_from_pileup_lines(lines, length, base_name):
    """Turn mpileup output lines into a coverage dict.

    Returns (coverage, was_empty). When the output is a single empty line the
    coverage is all zeroes over range(length) and was_empty is True.
    """
    if len(lines) == 1 and lines[0] == "":
        coverage = _zero_mrna_coverage(length)
        print(
            "Finished filling in coverage with zeroes because no reads aligned to this gene for %s"
            % base_name)
        return coverage, True

    coverage = {}
    for i, line in enumerate(lines):
        if line == "":
            continue
        s = line.split("\t")
        try:
            # Only validates the position column; a malformed line is
            # reported and processing continues, as before.
            int(s[1])
        except (ValueError, IndexError):
            print(s)
            print(base_name)
        # NOTE(review): coverage is keyed by the line index, not by the
        # position parsed above; the two agree only if mpileup emits one
        # line per position starting at position 1. Confirm.
        coverage[i] = int(s[3])
    return coverage, False


def calculate_mRNA_coverage(convert, transcript_lengths, coordinates_dict,
                            sequence_dict, goi_id, dataset, base_name):
    """Compute per-position read coverage of one transcript for one sample.

    Args:
        convert: mapping from goi_id to a gene name.
        transcript_lengths: mapping from gene name to transcript length.
        coordinates_dict: not used.
        sequence_dict: not used.
        goi_id: reference sequence id used to build the mpileup region.
        dataset: dataset name; selects the BAM directory and the entries of
            the module-level `special_cases` and `datasetinfo` used here.
        base_name: sample base name used to build the BAM path.

    For genes listed in special_cases[dataset], coverage is only measured
    when base_name[0] is in the matching datasetinfo[dataset] group (wt, opt1
    or opt2); otherwise an all-zero coverage is returned.

    Returns:
        dict mapping index -> coverage. Returns None when the gene is in
        special_cases[dataset] but matches none of the known names.
    """
    # The handle is never read from. It is still opened so that a missing or
    # unreadable BAM fails up front as before, but is now closed right away
    # instead of being leaked.
    # NOTE(review): this path uses the global `args.dataset`, not the
    # `dataset` parameter used for mpileup below. Confirm that is intended.
    bamfile = pysam.AlignmentFile(
        "./Annotated_size_filtered_reads/%s/%s_annotated_sized.bam" %
        (args.dataset, base_name), "rb")
    bamfile.close()

    gene = convert[goi_id]

    if gene not in special_cases[dataset]:
        length = transcript_lengths[gene]
        region = goi_id + ":" + str(0) + "-" + str(length)
        lines = _run_mrna_mpileup(
            region,
            "./Annotated_size_filtered_reads/%s/%s_annotated_sized.bam" %
            (dataset, base_name))

        print(base_name)

        mrna_coverage, was_empty = _coverage_from_pileup_lines(
            lines, length, base_name)
        if not was_empty:
            print("Finished calculating mRNA coverage for %s" % base_name,
                  "at", time.ctime())
        return mrna_coverage

    print("Special")
    print(goi_id)
    print(gene)

    # Pick the sample group in which this construct is expected to occur.
    if gene in ("ADAMTS13WT", "F9WT"):
        print(base_name, gene)
        expected_samples = datasetinfo[dataset].wt
        announce = True
    elif gene in ("ADAMTS13P118P", "F9opt1", "F91A", "F9V107V"):
        print(base_name, gene)
        expected_samples = datasetinfo[dataset].opt1
        announce = True
    elif gene == "F92A":
        print("hello")
        print(base_name, gene)
        print(datasetinfo[dataset].opt2)
        expected_samples = datasetinfo[dataset].opt2
        announce = False
    else:
        # Listed as special but not one of the known constructs: nothing is
        # computed (the original fell through and returned None).
        return None

    if base_name[0] in expected_samples:
        if announce:
            print("yep it's here")
        length = transcript_lengths[gene]
        region = goi_id + ":" + str(0) + "-" + str(length)
        lines = _run_mrna_mpileup(
            region,
            "./Annotated_size_filtered_reads/%s/%s_annotated_sized.bam" %
            (dataset, base_name))
        mrna_coverage, _was_empty = _coverage_from_pileup_lines(
            lines, length, base_name)
        print("Finished calculating mRNA coverage for %s" % base_name,
              "at", time.ctime())
        return mrna_coverage

    # Sample is not in the group for this construct: report zero coverage
    # without running mpileup.
    mrna_coverage = _zero_mrna_coverage(transcript_lengths[gene])
    print("Finished fudging mRNA coverage for %s" % base_name,
          "at", time.ctime())
    return mrna_coverage
コード例 #17
0
    def filter(self, in_mutation_file, in_bam, output):
        """Score each mutation by the strongest indel seen near it and write the survivors.

        For every record of ``in_mutation_file`` a pileup of ``in_bam`` is
        generated over the mutation extended by ``self.search_length`` bases
        on both sides.  Indel tokens in the pileup read-base column (matched
        by ``self.target``) are tallied per site and strand.  Among the indels
        lying within ``self.neighbor`` bases of the mutation, the count and
        rate (count / depth) of the one with the highest rate are remembered
        (on ties the last one seen wins).  The record is passed to
        ``self.write_result_file`` only when that count is at most
        ``self.min_mismatch`` or that rate is at most ``self.af_thres``.

        Args:
            in_mutation_file: Path to a tab-separated mutation list whose
                first five columns are chromosome, start, end, ref and alt
                (annovar format, 1-based, per the inline comment below).
            in_bam: Path to the BAM file that is piled up.
            output: Path of the result file; it is opened with mode ``'w'``.

        Note:
            Written for Python 2 (``print >>`` and the two-argument form of
            ``str.translate``).  The process exits with status 1 when the
            base and quality columns of a pileup line differ in length.
        """

        # Opened only so that an unreadable BAM fails early; the handle is not
        # used for the pileup itself and is closed with the other files below.
        samfile = pysam.Samfile(in_bam, "rb")

        srcfile = open(in_mutation_file,'r')
        hResult = open(output,'w')
        if self.header_flag:
            header = srcfile.readline().rstrip('\n')
            # NOTE(review): "tmismatch_count" looks like a typo for
            # "mismatch_count"; left untouched because it is emitted output.
            newheader = "tmismatch_count\tmismatch_rate"
            print >> hResult, (header +"\t"+ newheader)

        for line in srcfile:
            line = line.rstrip()
            itemlist = line.split('\t')

            # input file is annovar format (not zero-based number)
            chr = itemlist[0]
            start = (int(itemlist[1]) - 1)
            end = int(itemlist[2])
            ref = itemlist[3]
            alt = itemlist[4]

            # Best indel evidence found so far for this mutation.
            max_mismatch_count = 0
            max_mismatch_rate = 0

            # if (ref == '-' or alt == '-'):
            #     self.write_result_file(line, hResult, '---', '---')
            #     continue

            # Region string handed to mpileup: the mutation widened by
            # search_length on both sides, clamped at 0 on the left.
            region = chr +":"+str(max(0, (start - self.search_length + 1))) +"-"+ str(end + self.search_length)

            ####
            # print region
            # NOTE(review): samtools documents "-q" as the minimum mapping
            # quality, although the attribute is named base_qual_thres - confirm.
            for mpileup in pysam.mpileup( '-BQ', '0', '-d', '10000000', "-q", self.base_qual_thres, "-r", region, in_bam ):
                # print mpileup.rstrip()

                #
                # Prepare mpileup data
                #
                mp_list = str( mpileup.translate( None, '\n' ) ).split( '\t' )
                mp_list_len = len( mp_list )
                coordinate = mp_list[ 0:3 ]

                #
                # skip if depth is 0
                #
                if mp_list[ 3 ] == '0' or ( mp_list_len > 6 and mp_list[ 6 ] == '0' ):
                    continue

                #
                # skip if depth < min_depth
                #
                if int(mp_list[ 3 ]) < self.min_depth:
                    continue

                #
                depth = mp_list[ 3 ]
                read_bases = mp_list[ 4 ]
                qual_list = mp_list[ 5 ]

                #
                # position id,
                # mpileup output 4th column (number of reads covering the site),
                # 5th column (read bases),
                # 6th column (base quality)
                #
                # indel[ '+' or '-' ][ "chrom\tpos\tref\tBASES" ][ strand ] -> count
                indel = auto_vivification()

                #
                # Look for deletion/insertion and save info in 'indel' dictionary
                #
                #   ([\+\-])[0-9]+[ACGTNacgtn]+
                #
                # m.group(1): + or - (deletion/insertion)
                # m.group(2): number of deletion/insertion
                # m.group(3): nucleotides
                #
                # 'deleted' tracks how many characters have already been cut
                # out of read_bases, so that match offsets taken from the
                # original string can be mapped onto the shortened one.
                deleted = 0
                iter = self.target.finditer( read_bases )
                for m in iter:
                    site = m.start()
                    type = m.group( 1 )
                    num = m.group( 2 )
                    # The regex may run past the indel into following bases;
                    # keep only the announced number of nucleotides.
                    bases = m.group( 3 )[ 0:int( num ) ]
                    # Lower-case bases are counted on the '-' strand,
                    # upper-case ones on the '+' strand.
                    if bases.islower():
                        strand = ( '-', '+' )
                    else:
                        strand = ( '+', '-' )

                    key = '\t'.join( coordinate + [ bases.upper() ] )
                    if type in indel and key in indel[ type ]:
                        indel[ type ][ key ][ strand[ 0 ] ] += 1
                    else:
                        indel[ type ][ key ][ strand[ 0 ] ] = 1
                        indel[ type ][ key ][ strand[ 1 ] ] = 0

                    # Cut "<sign><num><bases>" out of the read-base string.
                    read_bases = read_bases[ 0:site - deleted ] + read_bases[ site + int(num) + len( num ) + 1 - deleted: ]
                    deleted += 1 + len( num ) + int( num )

                #
                # Remove '^.' and '$'
                #
                read_bases = self.remove_chr.sub( '', read_bases )
                read_bases = read_bases.translate( None, '$' )

                #
                # Error check: after stripping, one base per quality is expected
                #
                if len( read_bases ) != len( qual_list ):
                    logging.error( "mpileup data is not good: {0}, {1}".format( mpileup, read_bases ) )
                    exit(1)

                #
                # Compare every indel found at this site with the mutation
                #
                for type in ( '+', '-' ):
                    if type in indel:
                        for key in indel[ type ].keys():
                            start_pos = mp_list[ 1 ]

                            mismatch_count = ( indel[ type ][ key ][ '-' ] + indel[ type ][ key ][ '+' ])
                            mismatch_rate = (float(mismatch_count) / float(depth))

                            if mismatch_rate >= max_mismatch_rate:
                                start_pos = int(start_pos)
                                end_pos   = int(start_pos)

                                # A deletion starts one base after the pileup
                                # position and spans the deleted bases.
                                if (type == '-'):
                                    start_pos = int(start_pos) + 1
                                    end_pos = int(start_pos) + len((key.split('\t'))[3]) - 1

                                # print "m: " + str(start_pos) +"-"+ str(end_pos)
                                # print "o: " + str(start) +"-"+ str(end)
                                # print mismatch_count
                                # print mismatch_rate

                                # Accept the indel when either end of the
                                # mutation falls inside the indel span widened
                                # by self.neighbor.
                                if ((start_pos - self.neighbor <= int(start) + 1 and int(start) + 1 <= self.neighbor + end_pos)
                                  or(start_pos - self.neighbor <= int(end)      and  int(end)       <= self.neighbor + end_pos)):

                                    max_mismatch_count = mismatch_count
                                    max_mismatch_rate  = mismatch_rate

            ####
            # print "mmc: " + str(max_mismatch_count)
            # print "mm:  " + str(self.min_mismatch)
            if(max_mismatch_count <= self.min_mismatch or max_mismatch_rate <= self.af_thres):
                self.write_result_file(line, hResult, max_mismatch_count, max_mismatch_rate)

        ####
        hResult.close()
        srcfile.close()
        samfile.close()
コード例 #18
0
ファイル: example.py プロジェクト: Pfiver/RNA-Seqlyze

class Counter:
    """Callable tally: every invocation raises ``mCounts`` by one."""

    # Class-level starting value; the first call shadows it per instance.
    mCounts = 0

    def __call__(self, alignment):
        # The alignment itself is ignored; only the number of calls matters.
        self.mCounts = self.mCounts + 1


# Python 2 example script.  `samfile` is expected to be an already opened
# pysam file object created before this point.

# Hand a Counter instance to fetch() as the callback and report how many
# times it was called for region chr1:10-200.
c = Counter()
samfile.fetch(region="chr1:10-200", callback=c)
print "counts=", c.mCounts

print "########### Calling a samtools command line function ############"

# Run the samtools "mpileup" command through pysam and print each item of
# its result.
for p in pysam.mpileup("-c", "ex1.bam"):
    print str(p)

# Print whatever messages pysam recorded for the mpileup call.
print pysam.mpileup.getMessages()

print "########### Investigating headers #######################"

# playing around with headers
samfile = pysam.Samfile("ex3.sam", "r")
print samfile.references
print samfile.lengths
print samfile.text
print samfile.header
# Keep a reference to the header before the file is closed.
header = samfile.header
samfile.close()
コード例 #19
0
def _pileup_sites(pileup, chromosome):
    """Return ``((contig, position), coverage)`` pairs for matching pileup lines.

    A line is kept when *chromosome* occurs anywhere in it (substring test).
    Column 1 is taken as the contig, column 2 as the position and column 4 as
    the coverage.
    """
    if isinstance(pileup, str):
        # Some pysam versions return the command output as one string
        # instead of a list of lines.
        pileup = pileup.splitlines()
    sites = []
    for line in pileup:
        if chromosome not in line:
            continue
        fields = line.strip().split('\t')
        sites.append(((fields[0], int(fields[1])), int(fields[3])))
    return sites


def get_coverage(bams, reference, chromosome, merged, length, depth,
                 proportion):
    """
        Uses pysam tools to create mpileup, and determines the total coverage
        for either pairs of merged files, or individual merged files.

        A position counts as covered when its coverage is at least ``depth``;
        for pairs, the coverage of both files at the same position is added
        together first.  Inputs whose covered fraction
        (covered positions / ``length``) is below ``proportion`` are printed
        and passed to ``remove_files``.

        bams        -- iterable of ``(bam1, bam2)`` pairs when ``merged`` is
                       true, otherwise an iterable of single BAM paths
        reference   -- unused; kept for interface compatibility
        chromosome  -- text a pileup line must contain to be counted
        merged      -- whether ``bams`` holds pairs
        length      -- denominator of the covered fraction
        depth       -- minimum coverage for a position to count
        proportion  -- minimum covered fraction required to keep the input

        Positions are merged through a dictionary keyed on contig and
        position, so an empty pileup or pileups of different lengths are
        handled without any cursor bookkeeping.
    """
    removed_bams = []
    if merged:
        # Must be using pairs of matched files
        for b1, b2 in bams:
            combined = {}
            for pileup in (pysam.mpileup('-D', b1), pysam.mpileup('-D', b2)):
                for site, cov in _pileup_sites(pileup, chromosome):
                    combined[site] = combined.get(site, 0) + cov
            total_bases = sum(1 for cov in combined.values() if cov >= depth)
            if float(total_bases) / length < proportion:
                # Single-argument form prints the same text on Python 2 and 3.
                print('%s %s' % (b1, b2))
                removed_bams.extend([b1, b2])
    else:
        for b in bams:
            reads = pysam.mpileup('-D', b)
            total_bases = sum(
                1 for _, cov in _pileup_sites(reads, chromosome)
                if cov >= depth)
            if float(total_bases) / length < proportion:
                print(b)
                removed_bams.append(b)
    remove_files(removed_bams)
コード例 #20
0
def createAncientReads(individuals, snpFile, bamFilePath):
    """Write per-individual derived/ancestral/other read counts for each SNP.

    For every line of ``snpFile`` and every individual, ``pysam.mpileup`` is
    run on ``{bamFilePath}{individual}.sorted.bam`` at the SNP position and
    the read-base column is searched for the two alleles of the SNP.  One row
    per SNP is written to ``AncientReads.output`` in the current directory
    (an existing file is overwritten).

    Args:
        individuals: Sequence of individual names; each contributes the
            columns ``<name>_der``, ``<name>_anc`` and ``<name>_other``.
        snpFile: Path to a whitespace-separated SNP list in
            "rsID | chrom | pos | physPos | refAllele | newAllele" format.
            Chromosome codes ``23`` and ``24`` are translated to ``X``/``Y``.
        bamFilePath: Prefix (typically a directory ending in a separator)
            prepended to ``<individual>.sorted.bam``.
    """
    # Numeric codes used by the SNP file for the sex chromosomes.
    sex_chromosomes = {'23': 'X', '24': 'Y'}

    # need to create header for all individuals in list
    header = 'Chrom\tPos'
    for individual in individuals:
        header = f"{header}\t{individual}_der\t{individual}_anc\t{individual}_other"

    # The context manager closes both files even if a pileup call fails.
    with open(snpFile, 'r') as sFile, \
            open("AncientReads.output", 'w') as outFile:  # will OVERWRITE contents of existing file
        outFile.write(f"{header}\n")

        for line in sFile:
            lineList = line.split()
            if not lineList:
                # Blank line: nothing to look up.
                continue

            # resets/starts the line we'll write to AncientReads.output file
            writtenLine = f"{lineList[1]}\t{lineList[3]}"

            # in format "samtools mpileup -r chromosome:pos-pos bamFile"
            # Even if you only want one position, do pos-pos. Otherwise
            # you'll get reads from that position to the end of the
            # chromosome.
            chrom = sex_chromosomes.get(lineList[1], lineList[1])
            region = f"{chrom}:{lineList[3]}-{lineList[3]}"

            for individual in individuals:  # run mpileup on each individual
                # list of strings
                # ["chrom", "pos", "N", "total reads", "reads", "read quality"]
                reads = pysam.mpileup(
                    '-r', region,
                    f"{bamFilePath}{individual}.sorted.bam").split()

                if not reads:  # no read was found
                    ancReads = 0
                    derReads = 0
                    otherReads = 0
                else:
                    totalReads = int(reads[3])
                    # NOTE(review): str.count is case-sensitive, so bases
                    # reported in the other letter case end up in "other" -
                    # confirm this is intended.
                    ancReads = reads[4].count(lineList[4])
                    derReads = reads[4].count(lineList[5])
                    otherReads = totalReads - ancReads - derReads

                # append reads to line
                writtenLine = f"{writtenLine}\t{derReads}\t{ancReads}\t{otherReads}"

            outFile.write(f"{writtenLine}\n")
コード例 #21
0
ファイル: woodfox.py プロジェクト: cpwardell/bin
# Script fragment: for every indel, pile up the BAM given by args.n1 over a
# small window around the indel and add up, per window, the number of
# non-reference events and the read depth.
sitedepths=[]
totalevents=[]

# Set the window size and reference genome (the same for every indel)
window=10
genome="/home/chris_w/resources/b37/human_g1k_v37.fasta"

## Iterate through every indel
for indel in indels:
    ## Set properties of indel
    # indel[1] is expected to look like "CHROM:START-END"
    location=indel[1].split(":")
    CHROM=location[0]
    POS=int(location[1].split("-")[0])-1 # Pysam coordinates are ZERO-based, so we MUST subtract 1

    logging.debug(str(CHROM)+":"+str(POS))

    # Produce a pileup of bam over the specified window
    # NOTE(review): samtools region strings are documented as 1-based, while
    # POS has been shifted to 0-based above - confirm the window is not off by one.
    pile=pysam.mpileup("-f",genome,"-r",CHROM+":"+str(POS-window)+"-"+str(POS+window),args.n1)

    # Count how many SNVs and indels are present
    # Indel count is unreliable, e.g. a 3bp insertion could be "+3GAC" which is 4 characters
    # However, this inaccuracy is acceptable as indels are arguably worse than SNVs when considering
    # the reliability of a region
    # We can ignore "^" and "$" characters, as they denote the start/end of reads
    # Note that "^" is ALWAYS followed by another character, so we remove that, too
    siteevents=0
    sitedepth=0
    sites=0
    for site in pile:
        bases=site.split("\t")
        # Everything in the read-base column that is not a reference match
        # ("." or ","), a read-start marker plus its quality character, or a
        # read-end marker counts as an event.
        events=len(bases[4])-bases[4].count(".")-bases[4].count(",")-2*bases[4].count("^")-bases[4].count("$")
        siteevents=siteevents+events
        sitedepth=sitedepth+int(bases[3])
コード例 #22
0
ファイル: example.py プロジェクト: AndrewNguyenF3/pysam

# Section banner for the callback example that follows (Python 2 print statement).
print "########### Using a callback object ###### "

class Counter:
    """Callable tally: every invocation raises ``mCounts`` by one."""

    # Class-level starting value; the first call shadows it per instance.
    mCounts = 0

    def __call__(self, alignment):
        # The alignment itself is ignored; only the number of calls matters.
        self.mCounts = self.mCounts + 1

# Python 2 example script.  `samfile` is expected to be an already opened
# pysam file object created before this point.

# Hand a Counter instance to fetch() as the callback and report how many
# times it was called for region chr1:10-200.
c = Counter()
samfile.fetch( region = "chr1:10-200", callback = c )
print "counts=", c.mCounts

print "########### Calling a samtools command line function ############"

# Run the samtools "mpileup" command through pysam and print each item of
# its result.
for p in pysam.mpileup( "-c", "ex1.bam" ):
    print str(p)

# Print whatever messages pysam recorded for the mpileup call.
print pysam.mpileup.getMessages()

print "########### Investigating headers #######################"

# playing around with headers
samfile = pysam.Samfile( "ex3.sam", "r" )
print samfile.references
print samfile.lengths
print samfile.text
print samfile.header
# Keep a reference to the header before the file is closed.
header = samfile.header
samfile.close()
コード例 #23
0
ファイル: bam_rsub.py プロジェクト: drevoz/ngs_misc
def main():
    """Filter a BAM by read length and edit distance, then tabulate substitutions.

    Reads no longer than ``args.max_length`` are copied to a new BAM when
    they also pass the edit-distance filter: ``NM <= args.max_edit_dist``
    if that option is set, otherwise ``100 * NM / length <
    args.max_derived``.  Unless ``args.filter_only`` is set, the filtered BAM
    is piled up against ``args.ref_fasta``, deleted, and the per-type event
    counts returned by ``parseString`` are summed and printed as a
    two-line, tab-separated table.

    The process exits if ``parseString`` reports an event type that has no
    column in the table.
    """

    args = parse_command_line_arguments()

    # defaults and naming
    if not args.max_length:
        args.max_length = 1000
    if not args.max_edit_dist and not args.max_derived:
        args.max_edit_dist = 1000
    # The output name reuses the input name up to and including the dot
    # before its three-character extension.
    if args.max_edit_dist:
        outbam = '%slen%dNM%d.bam' % (args.bam_file[:-3], args.max_length, args.max_edit_dist)
    elif args.max_derived:
        outbam = '%slen%dder%d.bam' % (args.bam_file[:-3], args.max_length, args.max_derived)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    with pysam.AlignmentFile(args.bam_file, "rb") as samfile:
        print('%d reads before filtering' % samfile.count())
        i = 0
        with pysam.AlignmentFile(outbam, "wb", template=samfile) as tmpfile:
            for read in samfile.fetch():
                edit_dist = read.get_tag('NM')
                read_length = read.query_length
                if read_length <= args.max_length:
                    if args.max_edit_dist:
                        if edit_dist <= args.max_edit_dist:
                            tmpfile.write(read)
                            i += 1
                    elif args.max_derived:
                        if 100*edit_dist/read_length < args.max_derived:
                            tmpfile.write(read)
                            i += 1
        print('%d reads after filtering' % i)

    # pileup generation
    if not args.filter_only:
        pileup = pysam.mpileup('-f',args.ref_fasta,outbam)
        os.remove(outbam)

        # pileup parsing
        subst = {'AA':0,'TT':0,'CC':0,'GG':0,
                 'AC':0,'AT':0,'AG':0,
                 'CA':0,'CT':0,'CG':0,
                 'TA':0,'TC':0,'TG':0,
                 'GA':0,'GC':0,'GT':0,
                 'ins':0,'del':0,'un':0,'un_ref':0}
        for line in pileup:
            toks = line.strip('\n').split('\t')
            ref = toks[2].upper()
            alt = parseString(ref, toks[4]).__repr__()
            for alt_type,count in alt.iteritems():
                if ref in list('ACTG'):
                    try:
                        subst[alt_type] += count
                    except KeyError:
                        # Event type without a column: report it and stop.
                        print('%s %s' % (alt, toks[1]))
                        sys.exit()
                else:
                    subst['un_ref'] += count

        # output
        # Column order of the value row.  The last header label is "no_ref"
        # while the counter key is "un_ref", so the values are ordered by an
        # explicit key tuple rather than by searching the header text.
        key_order = ('AA', 'TT', 'CC', 'GG',
                     'AC', 'AT', 'AG',
                     'CA', 'CT', 'CG',
                     'TA', 'TC', 'TG',
                     'GA', 'GC', 'GT',
                     'ins', 'del', 'un', 'un_ref')
        keyorder = "AA\tTT\tCC\tGG\tAC\tAT\tAG\tCA\tCT\tCG\tTA\tTC\tTG\tGA\tGC\tGT\tins\tdel\tun\tno_ref"
        print(keyorder)
        print('\t'.join([str(subst[key]) for key in key_order]))
コード例 #24
0
def _pileup_sites(pileup, chromosome):
    """Return ``((contig, position), coverage)`` pairs for matching pileup lines.

    A line is kept when *chromosome* occurs anywhere in it (substring test).
    Column 1 is taken as the contig, column 2 as the position and column 4 as
    the coverage.
    """
    if isinstance(pileup, str):
        # Some pysam versions return the command output as one string
        # instead of a list of lines.
        pileup = pileup.splitlines()
    sites = []
    for line in pileup:
        if chromosome not in line:
            continue
        fields = line.strip().split('\t')
        sites.append(((fields[0], int(fields[1])), int(fields[3])))
    return sites


def get_coverage(bams, reference, chromosome, merged, length, depth, proportion):
    """
        Uses pysam tools to create mpileup, and determines the total coverage
        for either pairs of merged files, or individual merged files.

        A position counts as covered when its coverage is at least ``depth``;
        for pairs, the coverage of both files at the same position is added
        together first.  Inputs whose covered fraction
        (covered positions / ``length``) is below ``proportion`` are printed
        and passed to ``remove_files``.

        bams        -- iterable of ``(bam1, bam2)`` pairs when ``merged`` is
                       true, otherwise an iterable of single BAM paths
        reference   -- unused; kept for interface compatibility
        chromosome  -- text a pileup line must contain to be counted
        merged      -- whether ``bams`` holds pairs
        length      -- denominator of the covered fraction
        depth       -- minimum coverage for a position to count
        proportion  -- minimum covered fraction required to keep the input

        Positions are merged through a dictionary keyed on contig and
        position, so an empty pileup or pileups of different lengths are
        handled without any cursor bookkeeping.
    """
    removed_bams = []
    if merged:
        # Must be using pairs of matched files
        for b1, b2 in bams:
            combined = {}
            for pileup in (pysam.mpileup('-D', b1), pysam.mpileup('-D', b2)):
                for site, cov in _pileup_sites(pileup, chromosome):
                    combined[site] = combined.get(site, 0) + cov
            total_bases = sum(1 for cov in combined.values() if cov >= depth)
            if float(total_bases)/length < proportion:
                # Single-argument form prints the same text on Python 2 and 3.
                print('%s %s' % (b1, b2))
                removed_bams.extend([b1, b2])
    else:
        for b in bams:
            reads = pysam.mpileup('-D', b)
            total_bases = sum(
                1 for _, cov in _pileup_sites(reads, chromosome)
                if cov >= depth)
            if float(total_bases)/length < proportion:
                print(b)
                removed_bams.append(b)
    remove_files(removed_bams)
コード例 #25
0
ファイル: Bam.py プロジェクト: KatyBrown/ExtendTranscripts
def calculateCoverage(contig_file,
                      contig,
                      bam_file,
                      bamnam,
                      bedtool,
                      bednam,
                      outdir,
                      typ='samtools'):
    """Compute per-position coverage of one contig and save it as a TSV.

    The table is written to
    ``<outdir>/<contig>_coverage_<bamnam>_<typ>.tsv`` and also returned.

    Args:
        contig_file: Reference FASTA passed to ``mpileup -f`` (samtools mode).
        contig: Name of the contig whose rows are kept.
        bam_file: Path to the BAM file; it is indexed first if
            ``<bam_file>.bai`` does not exist.
        bamnam: Label stored in the ``bam`` column and used in the file name.
        bedtool: ``pybedtools.BedTool`` of the intervals to cover
            (bedtools mode only).
        bednam: Unused; kept for interface compatibility.
        outdir: Directory of the output table.
        typ: ``'samtools'`` (default) or ``'bedtools'``.

    Returns:
        pandas.DataFrame restricted to rows whose ``chromosome`` equals
        ``contig``.

    Raises:
        ValueError: If ``typ`` is neither ``'samtools'`` nor ``'bedtools'``.
    """
    # Reject an unknown mode before touching any file; otherwise neither
    # branch below would define the result table.
    if typ not in ('bedtools', 'samtools'):
        raise ValueError(
            "typ must be 'samtools' or 'bedtools', got %r" % (typ,))

    out_tab = "%s/%s_coverage_%s_%s.tsv" % (outdir, contig, bamnam, typ)
    # index the bam file (if needed)
    if not os.path.exists("%s.bai" % bam_file):
        pysam.index(bam_file)

    if typ == 'bedtools':
        with warnings.catch_warnings():
            # this raises a warning in Python 3.8 that is not relevant
            # ignore it
            # https://github.com/benoitc/gunicorn/issues/2091
            warnings.filterwarnings("ignore")
            bam = pybedtools.BedTool(bam_file)
            # d=True - calculate depth of coverage for every position
            # in the bam file
            cov = bedtool.coverage(bam, d=True)
        cov_df = cov.to_dataframe(
            names=['chromosome', 'start', 'end', 'pos', 'coverage'])

        cov_df['bam'] = bamnam
    else:
        # -A - keep anomalous read pairs
        # -B - don't calculate per base alignment quality
        # -C 0 - don't adjust based on mapping quality
        # -d 0 - don't limit coverage depth
        # -Q 0 - minimum quality score of 0
        # -aa - output every position even if coverage is 0
        cov = pysam.mpileup(bam_file, "-f", contig_file, "-A", "-B", "-C", "0",
                            "-d", "0", "-Q", "0", "-aa")

        # convert the coverage table into a dataframe; blank lines (such as
        # the empty string left after a trailing newline) carry no position
        # and are dropped here
        rows = [x.split("\t") for x in cov.split("\n") if x]

        cov_df = pd.DataFrame(rows,
                              columns=[
                                  'chromosome', 'pos', 'reference_base',
                                  'coverage', 'base_quality',
                                  'alignment_quality'
                              ])
        cov_df['bam'] = bamnam

        # defensive: tolerate short rows that lack a coverage or position
        cov_df['coverage'] = cov_df['coverage'].fillna(0)
        cov_df = cov_df[cov_df['pos'].notnull()]

        # convert to int
        cov_df['coverage'] = cov_df['coverage'].astype(int)
        cov_df['pos'] = cov_df['pos'].astype(int)
        cov_df['reference_base'] = cov_df['reference_base'].str.upper()

    # save the coverage table
    cov_df = cov_df[cov_df['chromosome'] == contig]
    cov_df.to_csv(out_tab, sep="\t", index=None)
    return cov_df