Example #1
0
 def testDoubleCalling(self):
     """Run idxstats twice in succession on the same BAM file.

     The second call would fail if there were an issue with stdout
     being improperly caught.
     """
     bam_path = os.path.join(DATADIR, "ex1.bam")
     for _attempt in range(2):
         retvals = pysam.idxstats(bam_path)
Example #2
0
 def testDoubleCalling(self):
     """Run idxstats twice in succession on the same BAM file.

     The second call would fail if there were an issue with stdout
     being improperly caught.
     """
     bam_path = os.path.join(DATADIR, "ex1.bam")
     for _attempt in range(2):
         retvals = pysam.idxstats(bam_path)
Example #3
0
def test_merge_and_switch():
    """Check read totals after merging BAM files and after swapping chrX.

    Read totals (idxstats columns three plus four) of the merged and of
    the swapped BAM file are compared with the totals of their inputs,
    and the header written for the swapped file is checked for the
    expected @PG record.
    """

    def open_bam(file_name):
        # Wrap a BAM file from the test directory without indexing it.
        return bam.BamFile(os.path.join(dir, file_name),
                           "samtools",
                           no_initial_index=True)

    def count_reads(bam_path):
        # Sum columns three and four of every complete idxstats record.
        total = 0
        for record in pysam.idxstats(bam_path).split("\n"):
            fields = record.split("\t")
            if len(fields) > 3:
                total += int(fields[2]) + int(fields[3])
        return total

    test_bam1 = open_bam("chr19_window.bam")
    test_bam2 = open_bam("chrX_window1.bam")
    test_bam3 = open_bam("chrX_window2.bam")

    # Merge the first two inputs, then load the resulting file.
    bam.samtools_merge("samtools",
                       [test_bam1.filepath, test_bam2.filepath],
                       os.path.join(dir, "merged1"), 1)
    merged = open_bam("merged1.merged.bam")

    test1_reads = count_reads(test_bam1.filepath)
    test2_reads = count_reads(test_bam2.filepath)
    test3_reads = count_reads(test_bam3.filepath)
    merged1_reads = count_reads(merged.filepath)
    assert merged1_reads == test1_reads + test2_reads

    # Swap chrX of the merged file for the third input.
    read_group = {"CL": ["foo"], "ID": "xyalign"}
    bam.switch_sex_chromosomes_sambamba("samtools", "sambamba",
                                        merged.filepath,
                                        test_bam3.filepath, "chrX",
                                        dir, "swapped", 1, read_group)
    swapped = open_bam("swapped.merged.bam")

    swapped_reads = count_reads(swapped.filepath)
    assert swapped_reads == test1_reads + test3_reads

    header = read_bed(os.path.join(dir, "swapped.header.sam"))
    assert ["@PG", "ID:xyalign", "CL:foo"] in header
def bam_blacklisted_reads(bam_handle,
                          chroms_to_ignore,
                          blackListFileName=None):
    """Count the reads of a BAM file that fall inside blacklisted regions.

    Parameters
    ----------
    bam_handle
        BAM handle; its ``filename`` attribute and ``count`` method
        are used.
    chroms_to_ignore
        Chromosome names to leave out; may be empty or None.
    blackListFileName
        Path of the blacklist file. When None, nothing is counted.

    Returns
    -------
    int
        Sum of ``bam_handle.count`` over all blacklisted regions; 0 when
        no blacklist file is given.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    import pysam
    import deeptools.mapReduce as mapReduce

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    # The file stays open while the regions are consumed and is closed
    # afterwards, even if counting fails.
    with open(blackListFileName, "r") as blacklist_file:
        bl = mapReduce.BED_to_interval_tree(blacklist_file)
        for chrom in bl.keys():
            if chroms_to_ignore and chrom in chroms_to_ignore:
                continue
            if chrom not in chromLens:
                # Blacklisted chromosome that is absent from the BAM file
                continue
            for reg in bl[chrom].find(0, chromLens[chrom]):
                blacklisted += bam_handle.count(reference=chrom,
                                                start=reg.start,
                                                end=reg.end)

    return blacklisted
Example #5
0
    def _setup(self, config, temp):
        """Write "contigs.table" into ``temp`` from the idxstats output of
        the input file, then delegate to CommandNode._setup.

        Contigs whose converted name is None, is neither numeric nor 'X',
        or is missing from self._contigs are left out. NodeError is raised
        when a contig size reported for the BAM differs from the size
        stored in self._contigs.
        """
        table_path = os.path.join(temp, "contigs.table")
        with open(table_path, "w") as handle:
            handle.write("ID\tSize\tNs\tHits\n")

            # Joining yields one string for both return styles of
            # pysam.idxstats (list for pysam < 0.9, str for >= 0.9)
            stats = "".join(pysam.idxstats(self._input_file))
            for record in stats.split('\n'):
                record = record.strip()
                if not record:
                    continue

                name, size, hits, _ = record.split('\t')
                name = contig_name_to_plink_name(name)
                if name is None:
                    continue
                if not name.isdigit() and name != 'X':
                    continue
                if name not in self._contigs:
                    # Excluding contigs is allowed
                    continue

                contig = self._contigs[name]
                if int(size) != contig['Size']:
                    # NOTE(review): the BAM size fills the "expected" slot
                    # and the stored size the "found" slot - confirm that
                    # this order is intended.
                    raise NodeError(
                        "Size mismatch between database and BAM; "
                        "expected size %i, found %i for contig %r" %
                        (int(size), contig['Size'], name))

                handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(
                    ID=name,
                    Size=contig['Size'],
                    Ns=contig['Ns'],
                    Hits=hits))

        CommandNode._setup(self, config, temp)
Example #6
0
def getNumReads(bamfile):
    '''count number of reads in bam file.

    This method works through pysam.idxstats and sums the third column
    of its output.

    Arguments
    ---------
    bamfile : string
        Filename of :term:`bam` formatted file. The file needs
        to be indexed.
    Returns
    -------
    nreads : int
        Number of reads

    Raises
    ------
    IndexError
        If a line of the idxstats output has fewer than three fields.
    '''

    lines = pysam.idxstats(bamfile)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(lines, str):
        lines = lines.splitlines()

    try:
        nreads = sum(
            int(x.split("\t")[2]) for x in lines if not x.startswith("#"))

    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, lines))

    return nreads
Example #7
0
def test_idxstats_parse_split_lines():
    """Parse idxstats output requested as a list of lines."""
    bam_filename = "./pysam_data/ex2.bam"
    # pysam 0.8.X style output: split_lines=True returns a list of lines
    for record in pysam.idxstats(bam_filename, split_lines=True):
        fields = record.split()
        _seqname, _seqlen, nmapped, _nunmapped = fields
Example #8
0
def print_sex(bam):
    """
    Print sex based on chr x ratio

    The ratio of mapped reads to chromosome length is computed for the
    first 24 idxstats records; the record at index 22 is used as
    chromosome X. 'female' is printed when the X ratio lies within two
    standard deviations of the mean ratio, 'male' when it lies more than
    two standard deviations below the mean, and "unkown" otherwise.

    Args:
        bam (str): Path to bam file
    """

    idxstats = pysam.idxstats(bam)
    # Depending on the pysam version this is a list of lines or one string;
    # slicing and indexing below need a list of records.
    if isinstance(idxstats, str):
        idxstats = idxstats.splitlines()

    def mapped_per_base(record):
        # Mapped reads (column 3) divided by chromosome length (column 2)
        fields = record.strip('\n').split('\t')
        return float(fields[2]) / float(fields[1])

    # Calculate read / chromosome length ratio per chromosome
    chr_ratio = [mapped_per_base(record) for record in idxstats[0:24]]

    chr_ratio_std = numpy.std(chr_ratio)
    chr_ratio_mean = numpy.mean(chr_ratio)
    lower_bound = chr_ratio_mean - (2 * chr_ratio_std)
    upper_bound = chr_ratio_mean + (2 * chr_ratio_std)

    chr_x_ratio = mapped_per_base(idxstats[22])

    # A single parenthesised argument prints identically under Python 2
    # and Python 3.
    if lower_bound < chr_x_ratio < upper_bound:
        print('female')
    elif chr_x_ratio < lower_bound:
        print('male')
    else:
        print("unkown")
Example #9
0
def bam_total_reads(bam_handle, chroms_to_ignore):
    """Count the total number of mapped reads in a BAM file, filtering
    the chromosome given in chroms_to_ignore list

    When chroms_to_ignore is empty or None, ``bam_handle.mapped`` is
    returned directly. Otherwise the mapped-read counts reported by
    idxstats are summed over all chromosomes that are not ignored. If
    idxstats produced no output, a message is written to stderr and 0 is
    returned.
    """
    if not chroms_to_ignore:
        return bam_handle.mapped

    import pysam

    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    # Splitting an empty string yields [''], which would hide an empty
    # idxstats result and break the unpacking below.
    lines = [line for line in lines if line.strip()]
    if not lines:
        # check if this is a test running under nose
        # in which case it will fail.
        if any("nose" in name for name in list(sys.modules)):
            sys.stderr.write("To run this code inside a test use disable "
                             "output buffering `nosetest -s`\n")
        else:
            sys.stderr.write("Error running idxstats on {}\n".format(bam_handle.filename))

    tot_mapped_reads = 0
    for line in lines:
        chrom, _len, nmapped, _nunmapped = line.split('\t')
        if chrom not in chroms_to_ignore:
            tot_mapped_reads += int(nmapped)

    return tot_mapped_reads
def get_contigs_with_reads(bam_path: str,
                           with_length: bool = False) -> Generator:
    """
    Get all contigs that have mapped or unmapped reads assigned to them

    Lines of the idxstats output that do not consist of four fields with
    numeric counts (for example the blank entry after the last newline)
    are skipped.

    Args:
        bam_path(str): path to bam file

        with_length(bool): also yield the length of the contig

    Yields:
        contig(str), or a (contig, length) tuple when with_length is set

    """
    for line in pysam.idxstats(bam_path).split('\n'):
        # Only the parsing is guarded, so errors raised elsewhere are not
        # swallowed.
        try:
            contig, contig_len, mapped_reads, unmapped_reads = line.split()
            n_mapped = int(mapped_reads)
            n_unmapped = int(unmapped_reads)
            length = int(contig_len) if with_length else None
        except ValueError:
            continue

        if n_mapped > 0 or n_unmapped > 0:
            if with_length:
                yield contig, length
            else:
                yield contig
Example #11
0
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """Sum the per-region results of bam_blacklisted_worker over all
    blacklisted regions.

    Parameters
    ----------
    bam_handle
        BAM handle; only its ``filename`` attribute is used.
    chroms_to_ignore
        Chromosome names to leave out; may be empty or None.
    blackListFileName
        Path of the blacklist file. When None, 0 is returned.
    numberOfProcessors
        With more than one processor and more than one region, the
        regions are processed in a multiprocessing pool.

    Returns
    -------
    The summed worker results; 0 when there is no blacklist file or no
    region to process.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if not regions:
        return blacklisted

    if len(regions) > 1 and numberOfProcessors > 1:
        import multiprocessing
        pool = multiprocessing.Pool(numberOfProcessors)
        try:
            res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
        finally:
            # Shut the worker processes down whether or not the map succeeded
            pool.terminate()
            pool.join()
    else:
        res = [bam_blacklisted_worker(x) for x in regions]

    for val in res:
        blacklisted += val

    return blacklisted
Example #12
0
    def _setup(self, config, temp):
        """Write "contigs.table" into ``temp`` from the idxstats output of
        the input file, then delegate to CommandNode._setup.

        Contig names are translated through self._mapping (unmapped names
        are kept); contigs missing from self._contigs are left out.
        """
        table_path = os.path.join(temp, "contigs.table")
        with open(table_path, "w") as handle:
            handle.write("ID\tSize\tNs\tHits\n")

            # Joining yields one string for both return styles of
            # pysam.idxstats (list for pysam < 0.9, str for >= 0.9)
            stats = "".join(pysam.idxstats(self._input_file))
            for record in stats.split("\n"):
                record = record.strip()
                if not record:
                    continue

                name, _size, hits, _ = record.split("\t")
                name = self._mapping.get(name, name)
                if name in self._contigs:
                    contig = self._contigs[name]
                    handle.write("{ID}\t{Size}\t{Ns}\t{Hits}\n".format(
                        ID=name,
                        Size=contig["Size"],
                        Ns=contig["Ns"],
                        Hits=hits))
                # Excluding contigs is allowed

        CommandNode._setup(self, config, temp)
Example #13
0
def getNumReads(bamfile):
    '''count number of reads in bam file.

    This method works through pysam.idxstats and sums the third column
    of its output.

    Arguments
    ---------
    bamfile : string
        Filename of :term:`bam` formatted file. The file needs
        to be indexed.
    Returns
    -------
    nreads : int
        Number of reads

    Raises
    ------
    IndexError
        If a line of the idxstats output has fewer than three fields.
    '''

    lines = pysam.idxstats(bamfile)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(lines, str):
        lines = lines.splitlines()

    try:
        nreads = sum(
            int(x.split("\t")[2]) for x in lines if not x.startswith("#"))

    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, lines))

    return nreads
Example #14
0
def test_idxstats_parse():
    """Parse idxstats output requested as a single string."""
    bam_filename = "./pysam_data/ex2.bam"
    # pysam 0.9.X style output: one string that needs to be split by \n
    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)
    for record in idxstats_string.splitlines():
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
Example #15
0
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """Sum the per-region results of bam_blacklisted_worker over all
    blacklisted regions.

    Parameters
    ----------
    bam_handle
        BAM handle; only its ``filename`` attribute is used.
    chroms_to_ignore
        Chromosome names to leave out; may be empty or None.
    blackListFileName
        Path of the blacklist file. When None, 0 is returned.
    numberOfProcessors
        With more than one processor and more than one region, the
        regions are processed in a multiprocessing pool.

    Returns
    -------
    The summed worker results; 0 when there is no blacklist file or no
    region to process.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if not regions:
        return blacklisted

    if len(regions) > 1 and numberOfProcessors > 1:
        import multiprocessing
        pool = multiprocessing.Pool(numberOfProcessors)
        try:
            res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
        finally:
            # Shut the worker processes down whether or not the map succeeded
            pool.terminate()
            pool.join()
    else:
        res = [bam_blacklisted_worker(x) for x in regions]

    for val in res:
        blacklisted += val

    return blacklisted
Example #16
0
    def _coverage(self,bam_obj):
        """Return the summed third idxstats column divided by the summed
        reference lengths, multiplied by the average read length."""
        records = pysam.idxstats(bam_obj.filename).rstrip().split('\n')
        mapped_total = 0
        for record in records:
            mapped_total += int(record.split('\t')[2])

        reference_bases = sum(bam_obj.lengths)

        # NOTE(review): with integer operands this division truncates
        # under Python 2 - confirm the interpreter version this runs on.
        return mapped_total/reference_bases*self._avg_read_len(bam_obj)
Example #17
0
def bam_total_reads(bam_handle, chroms_to_ignore):
    """Count the total number of mapped reads in a BAM file, filtering
    the chromosome given in chroms_to_ignore list

    When chroms_to_ignore is empty or None, ``bam_handle.mapped`` is
    returned directly. Otherwise the mapped-read counts reported by
    idxstats are summed over all chromosomes that are not ignored. If
    idxstats produced no output, a message is written to stderr and 0 is
    returned.
    """
    if not chroms_to_ignore:
        return bam_handle.mapped

    import pysam

    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    # Splitting an empty string yields [''], which would hide an empty
    # idxstats result and break the unpacking below.
    lines = [line for line in lines if line.strip()]
    if not lines:
        # check if this is a test running under nose
        # in which case it will fail.
        if any("nose" in name for name in list(sys.modules)):
            sys.stderr.write("To run this code inside a test use disable "
                             "output buffering `nosetest -s`\n")
        else:
            sys.stderr.write("Error running idxstats on {}\n".format(bam_handle.filename))

    tot_mapped_reads = 0
    for line in lines:
        chrom, _len, nmapped, _nunmapped = line.split('\t')
        if chrom not in chroms_to_ignore:
            tot_mapped_reads += int(nmapped)

    return tot_mapped_reads
def main(args):
    """Write one tab-separated row of statistics per BAM file.

    Every path in ``args.bams`` must exist; otherwise a message is
    printed and the process exits with a non-zero status. The idxstats
    records of each file are fed into a bamStats object, and after
    calculateStats() one row per file is written to ``args.output``.
    """
    # Fail-fast check for file existence
    for b in args.bams:
        if not os.path.isfile(b):
            print("Could not find file: {}!".format(str(b)))
            # Non-zero status so that callers can detect the failure
            sys.exit(1)

    data = {}
    with smartOut(args.output) as out:
        for b in args.bams:
            stats = bamStats()
            for line in pysam.idxstats(b).split('\n'):
                segs = line.split('\t')
                if len(segs) < 4:
                    # Blank or incomplete record
                    continue
                stats.addChr(segs[0], int(segs[1]), int(segs[2]),
                             int(segs[3]))
            data[b] = stats

        out.write(
            "BamName\tTotalReads\tMappedReads\tUnmappedReads\tMapProportion\tRawXcov\tMapXcov\tavgRawChrCov\tavgMapChrCov\n"
        )
        for b, v in data.items():
            v.calculateStats()
            out.write("{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\n"\
                      .format(b, v.totalReads, v.mappedReads, v.unmappedReads, v.mapProportion, v.rawXcov, v.mapXcov, v.avgRawChrCov, v.avgMapChrCov))
Example #19
0
def print_sex(bam):
    """
    Print sex based on chr x ratio

    The ratio of mapped reads to chromosome length is computed for the
    first 24 idxstats records; the record at index 22 is used as
    chromosome X. "female" is printed when the X ratio lies within two
    standard deviations of the mean ratio, "male" when it lies more than
    two standard deviations below the mean, and "unkown" otherwise.

    Args:
        bam (str): Path to bam file
    """

    idxstats = pysam.idxstats(bam)
    # Depending on the pysam version this is a list of lines or one string;
    # slicing and indexing below need a list of records.
    if isinstance(idxstats, str):
        idxstats = idxstats.splitlines()

    def mapped_per_base(record):
        # Mapped reads (column 3) divided by chromosome length (column 2)
        fields = record.strip("\n").split("\t")
        return float(fields[2]) / float(fields[1])

    # Calculate read / chromosome length ratio per chromosome
    chr_ratio = [mapped_per_base(record) for record in idxstats[0:24]]

    chr_ratio_std = numpy.std(chr_ratio)
    chr_ratio_mean = numpy.mean(chr_ratio)
    lower_bound = chr_ratio_mean - (2 * chr_ratio_std)
    upper_bound = chr_ratio_mean + (2 * chr_ratio_std)

    chr_x_ratio = mapped_per_base(idxstats[22])

    # A single parenthesised argument prints identically under Python 2
    # and Python 3.
    if lower_bound < chr_x_ratio < upper_bound:
        print("female")
    elif chr_x_ratio < lower_bound:
        print("male")
    else:
        print("unkown")
Example #20
0
    def _setup(self, config, temp):
        """Write "contigs.table" into ``temp`` from the idxstats output of
        the input file, then delegate to CommandNode._setup.

        Contigs whose converted name is None, is neither numeric nor 'X',
        or is missing from self._contigs are left out. NodeError is raised
        when a contig size reported for the BAM differs from the size
        stored in self._contigs.
        """
        with open(os.path.join(temp, "contigs.table"), "w") as handle:
            handle.write("ID\tSize\tNs\tHits\n")

            # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
            for line in "".join(pysam.idxstats(self._input_file)).split('\n'):
                line = line.strip()
                if not line:
                    continue

                name, size, hits, _ = line.split('\t')
                name = contig_name_to_plink_name(name)
                if name is None or not (name.isdigit() or name == 'X'):
                    continue
                elif name not in self._contigs:
                    # Skip contigs without a stored entry instead of
                    # failing with a KeyError below
                    continue

                contig = self._contigs[name]
                if int(size) != contig['Size']:
                    raise NodeError(
                        "Size mismatch between database and BAM for contig "
                        "%r: database has %i, BAM has %i"
                        % (name, contig['Size'], int(size)))

                row = {
                    'ID': name,
                    'Size': contig['Size'],
                    'Ns': contig['Ns'],
                    'Hits': hits,
                }

                handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(**row))

        CommandNode._setup(self, config, temp)
def main(argv):
    """Collect metadata about a BAM file and connect to the sample database.

    ``argv[1]`` is the path of the BAM file. Sequencing id and aligner
    index are taken from components of the absolute path, the aligner and
    its command line from the first @PG header record, and the read total
    from the third idxstats column. All values are then wrapped in single
    quotes before the database connection is opened.
    """

    fullname = os.path.abspath(argv[1])
    bamfile = pysam.Samfile(fullname)
    try:
        header = bamfile.header
    finally:
        # The handle is only needed for reading the header
        bamfile.close()
    fullname_s = fullname.split("/")

    # Populate info dict
    info = dict()
    info['idsequencing'] = fullname_s[-2].split("seq")[1]
    info['filename'] = os.path.basename(fullname)
    info['aligner_index'] = fullname_s[-4]
    bam_datetime = datetime.datetime.fromtimestamp(os.stat(fullname).st_mtime)
    info['align_datetime'] = bam_datetime.strftime("%Y-%m-%d %H:%M:%S")
    info['aligner'] = header['PG'][0]['PN']
    info['command'] = header['PG'][0]['cl']

    # Compute total number of aligned reads
    stats = pysam.idxstats(fullname)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(stats, str):
        stats = stats.splitlines()
    info['total_reads'] = sum(int(el.split("\t")[2]) for el in stats)

    # Format dict entries for MySQL
    # NOTE(review): quoting values by hand suggests string-built SQL
    # further on; parameterized queries would be safer - confirm.
    for key in list(info):
        info[key] = "'" + str(info[key]) + "'"

    ## Connect to db
    # NOTE(review): the database credentials are hard-coded here; they
    # should come from configuration or the environment.
    try:
        conn = mdb.connect('localhost', 'brad', 'Eu23ler1', 'sample_db')
        cur = conn.cursor()
    except mdb.Error as e:
        # Two format specifiers need a two-element tuple
        print("MySQLdb error %d: %s " % (e.args[0], e.args[1]))
Example #22
0
def bamStats(bamfile):
    """ Extract average depths + idxstats data from BAM file, return data frame

    For every idxstats record the read length is averaged over at most
    the first 10001 reads fetched for that reference, and the coverage is
    computed as MAPPED * READLEN / NT. If any step of this fails for a
    reference, the values not yet computed stay at 0.
    """
    columns = ["CHROM", "NT", "MAPPED", "UNMAPPED", "READLEN", "COVERAGE"]

    istats = pysam.idxstats(bamfile)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(istats, str):
        istats = istats.splitlines()

    result = []
    samfile = pysam.Samfile(bamfile, "rb")
    try:
        for x in istats:
            xs = x.replace("\n", "").split("\t")
            rec = {
                "CHROM": xs[0],
                "NT": int(xs[1]),
                "MAPPED": int(xs[2]),
                "UNMAPPED": int(xs[3]),
                "READLEN": 0,
                "COVERAGE": 0.0,
            }
            count = 0
            rls = 0.0
            try:
                for read in samfile.fetch(xs[0]):
                    rls += float(read.rlen)
                    count += 1
                    if count > 10000:
                        break

                rls /= count
                rec["READLEN"] = rls
                rec["COVERAGE"] = float(rec["MAPPED"] * rec["READLEN"])/float(rec["NT"])
            except Exception:
                # Best effort: e.g. no reads (division by zero) leaves the
                # defaults in place. Interrupts are no longer swallowed.
                pass
            result.append(rec)
    finally:
        samfile.close()

    if result:
        return pandas.DataFrame(result, columns=columns)
    return pandas.DataFrame(columns=columns)
Example #23
0
def find_coverage(df, read_length=150.):
    """Add a 'cov' column with an estimated coverage per BAM file.

    For every row the idxstats output of the file in column 'BAM_path'
    is read; coverage is (mapped + unmapped reads) * read_length divided
    by the summed reference lengths. The '*' record is left out.

    Args:
        df: data frame with a 'BAM_path' column; modified in place.
        read_length: read length used in the estimate (default 150).

    Returns:
        The same data frame, with the 'cov' column filled in.
    """
    print("finding coverage of BAM files")

    df['cov'] = None

    for index, sample in df.iterrows():
        bam = sample['BAM_path']

        stats = pysam.idxstats(bam)
        nreads_mapped = 0
        nreads_unmapped = 0
        total_bp = 0
        for line in stats.split("\n"):
            # Strip a carriage return left by Windows-style line endings
            fields = line.rstrip("\r").split("\t")
            if len(fields) > 3 and fields[0] != '*':
                total_bp += int(fields[1])
                nreads_mapped += int(fields[2])
                nreads_unmapped += int(fields[3])

        # float() keeps the division exact even for an integer read length
        cov = (nreads_unmapped + nreads_mapped) * float(read_length) / total_bp

        df.loc[index, 'cov'] = cov

    return df
def get_num_reads(filename):
    """Return the number of reads (mapped plus unmapped) listed by idxstats.

    All columns from the third one onwards are summed over every record.
    If the counts cannot be obtained, a message is written to stderr and
    0 is returned.
    """
    try:
        stats = pysam.idxstats(filename)
        # Depending on the pysam version this is a list of lines or one
        # string
        if isinstance(stats, str):
            stats = stats.splitlines()
        # Plain integer parsing instead of eval() on file-derived text
        per_contig = [
            sum(int(field) for field in line.rstrip('\n').split('\t')[2:])
            for line in stats if line.strip()
        ]
    except Exception:
        # Best effort: any failure is reported below and yields 0
        per_contig = []

    if not per_contig:
        sys.stderr.write("Unable to count reads in file: %s" % filename)
        return 0
    return sum(per_contig)
Example #25
0
def count_all(bamfile):
    """Return the number of mapped reads listed by idxstats.

    Despite the name, only the third idxstats column (mapped reads) is
    summed; unmapped reads are not included. Returns 0 when idxstats
    lists no records.
    """
    stats = pysam.idxstats(bamfile)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(stats, str):
        stats = stats.splitlines()

    #count mapped reads
    return sum(
        int(line.rstrip('\n').split('\t')[2])
        for line in stats if line.strip())
Example #26
0
 def testReturnValueString(self):
     """Check the type of the value returned by idxstats: a str (and not
     bytes) under Python 3, bytes and basestring otherwise."""
     bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
     retval = pysam.idxstats(bam_path)
     if not IS_PYTHON3:
         self.assertIsInstance(retval, bytes)
         self.assertIsInstance(retval, basestring)
         return
     self.assertNotIsInstance(retval, bytes)
     self.assertIsInstance(retval, str)
Example #27
0
def test_idxstats_parse():
    """Parse idxstats output requested as a single string."""
    bam_filename = os.path.join(BAM_DATADIR, "ex2.bam")
    # pysam 0.9.X style output: one string that needs to be split by \n
    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)
    for record in idxstats_string.splitlines():
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
Example #28
0
 def testReturnValueString(self):
     """Check the type of the value returned by idxstats: a str (and not
     bytes) under Python 3, bytes and basestring otherwise."""
     bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
     retval = pysam.idxstats(bam_path)
     if not IS_PYTHON3:
         self.assertIsInstance(retval, bytes)
         self.assertIsInstance(retval, basestring)
         return
     self.assertNotIsInstance(retval, bytes)
     self.assertIsInstance(retval, str)
Example #29
0
 def _get_bam_stats(bam_filepath):
     """Load the idxstats output of a BAM file into a DataFrame indexed by
     the "ref chrom" column."""
     idxstats_text = pysam.idxstats(bam_filepath)
     read_options = {
         "delimiter": "\t",
         "names": ["ref chrom", "ref len", "mapped", "unmapped"],
         "index_col": "ref chrom",
     }
     return pd.read_csv(StringIO(idxstats_text), **read_options)
Example #30
0
def test_idxstats_parse():
    """Parse idxstats output requested as a single string."""
    bam_filename = os.path.join(BAM_DATADIR, "ex2.bam")
    # pysam 0.9.X style output: one string that needs to be split by \n
    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)
    for record in idxstats_string.splitlines():
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
Example #31
0
def getChromsFromBAM(filename):
    """Return the first idxstats column of every record, leaving out the
    '*' entry and empty names."""
    first_columns = (
        record.split("\t")[0]
        for record in pysam.idxstats(filename).split("\n"))
    return [name for name in first_columns if name not in ('*', '')]
def getBamReads(bam):
    '''
    get total reads from a bam file

    The third idxstats column (mapped reads) is summed over all records;
    blank lines are skipped. Returns 0 when there is no record.
    '''
    return sum(
        int(line.split('\t')[2])
        for line in pysam.idxstats(bam).split('\n')
        if line.strip())
Example #33
0
def Main():
    """Annotate interactions with one RPKM value per data set.

    Reads the data sets named on the command line, determines a read
    total for each of them, and writes every interaction line of the
    input file to the output file with the RPKM values of both of its
    regions inserted. Lines that involve "chrM" are skipped.
    """
    args = ParseArg()

    # NOTE(review): this error path exits with status 0 - confirm that a
    # non-zero status is not expected by callers.
    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        sys.exit(0)

    # store data information
    data = {}
    total_reads = {}
    for i in range(len(args.data)):
        temp_name = args.name[i]
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            # Sum of the third idxstats column.
            # NOTE(review): this iterates the return value line by line,
            # which presumably requires a pysam version that returns a
            # list of lines rather than one string - verify.
            total_reads[temp_name] = reduce(lambda x, y: x + y, [
                int(l.rstrip('\n').split('\t')[2])
                for l in pysam.idxstats(args.data[i])
            ])
        else:
            # Every other format is parsed as bed and counted entry by entry
            Format = "bed"
            for b in TableIO.parse(args.data[i], Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, "  reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])

    # NOTE(review): neither file handle is closed explicitly in this
    # function.
    output = open(args.output, 'w')

    Input = open(args.input, 'r')
    lines = Input.read().split("\n")

    # header
    # Each column name is emitted twice, suffixed _1 and _2, followed by
    # the interaction and p-value columns.
    header = ["chr", "start", "end", "type", "name", "subtype", "count"
              ] + data.keys()
    print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2]
                               for g in header) + "\tinteraction\tp-value"

    num = 0
    print >> sys.stderr, "Start process interactions:"
    for l in lines:
        if l.strip() == '': continue
        l = l.strip().split('\t')
        num = num + 1
        if l[0] == "chrM" or l[7] == "chrM": continue
        # Columns 0-2 describe the first region, columns 7-9 the second
        C1 = Bed([l[0], int(l[1]), int(l[2])])
        C2 = Bed([l[7], int(l[8]), int(l[9])])
        rpkm1 = "\t".join(
            str(f) for f in
            [RPKM(C1, data[n], total_reads[n], n) for n in data.keys()])
        rpkm2 = "\t".join(
            str(f) for f in
            [RPKM(C2, data[n], total_reads[n], n) for n in data.keys()])
        print >> output, "\t".join(
            str(f) for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
        if num % 1000 == 0:
            print >> sys.stderr, "  Output interaction: %d\r" % (num),
def calculate_samples(bamfile, count, ref_prefix=None):
    """Distribute ``count`` over the references of a BAM file in
    proportion to their mapped-read counts.

    Args:
        bamfile: path of the BAM file passed to pysam.idxstats.
        count: total that is split across the references.
        ref_prefix: when given, only references whose name starts with
            this prefix are used.

    Returns:
        dict mapping reference name to a tuple (mapped reads, share of
        ``count`` rounded to an integer).
    """
    refcounts = dict()
    for s in pysam.idxstats(bamfile).split("\n"):
        if not s.strip():
            # Blank entry, e.g. the one after the final newline
            continue
        tok = s.rstrip().split("\t")
        if ref_prefix is not None and not tok[0].startswith(ref_prefix):
            continue
        refcounts[tok[0]] = int(tok[2])
    coef = (count * 1.0) / sum(refcounts.values())
    return dict((k, (v, int(np.round(v * coef)))) for k, v in refcounts.items())
Example #35
0
def countReadsInBAM(filename):
    """Return the summed third idxstats column (mapped reads) of a BAM
    file, leaving out the '*' record."""
    stats = pysam.idxstats(filename)
    nreads = 0
    for row in stats.split("\n"):
        # Strip a carriage return left by Windows-style line endings
        fields = row.rstrip("\r").split("\t")
        if len(fields) > 2 and fields[0] != '*':
            nreads += int(fields[2])
    return nreads
def get_total_reads(bam_filename):
    """Return the sum of the second-to-last idxstats column over all
    non-empty records."""
    records = pysam.idxstats(bam_filename).split('\n')
    return sum(int(record.split('\t')[-2]) for record in records if record)
Example #37
0
def test_idxstats_parse():
    """Parse idxstats output requested as a single string."""
    bam_filename = "./pysam_data/ex2.bam"
    # pysam 0.9.X style output: one string that needs to be split by \n
    idxstats_string = pysam.idxstats(bam_filename, split_lines=False)
    for record in idxstats_string.splitlines():
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
Example #38
0
def calculate_samples(bamfile, count, ref_prefix=None):
    """Distribute ``count`` over the references of a BAM file in
    proportion to their mapped-read counts.

    Args:
        bamfile: path of the BAM file passed to pysam.idxstats.
        count: total that is split across the references.
        ref_prefix: when given, only references whose name starts with
            this prefix are used.

    Returns:
        dict mapping reference name to a tuple (mapped reads, share of
        ``count`` rounded to an integer).
    """
    refcounts = dict()
    for s in pysam.idxstats(bamfile).split("\n"):
        if not s.strip():
            # Blank entry, e.g. the one after the final newline
            continue
        tok = s.rstrip().split("\t")
        if ref_prefix is not None and not tok[0].startswith(ref_prefix):
            continue
        refcounts[tok[0]] = int(tok[2])
    coef = (count * 1.0) / sum(refcounts.values())
    return dict(
        (k, (v, int(np.round(v * coef)))) for k, v in refcounts.items())
Example #39
0
 def _method_pysam(self, *args, **kwargs):
     """Index self.infile and write its idxstats output, preceded by a
     header line, to self.outfile. The extra arguments are not used."""
     import pysam
     # index the bam file
     pysam.index(self.infile)
     # create count table
     with open(self.outfile, 'wt') as out:
         # "\n" instead of os.linesep: a file opened in text mode already
         # translates line endings, so os.linesep would double the
         # carriage return on Windows.
         out.write("Reference sequence name\tSequence length\t"
                   "Mapped reads\tUnmapped reads\n")
         # Joining covers both a list of lines and a single string and
         # writes the table in one call.
         out.write("".join(pysam.idxstats(self.infile)))
Example #40
0
def bam_total_reads(bam_fname):
    """Count the total number of mapped reads in a BAM file.

    Uses the BAM index to do this quickly.
    """
    lines = pysam.idxstats(bam_fname)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(lines, str):
        lines = lines.splitlines()
    tot_mapped_reads = 0
    for line in lines:
        if not line.strip():
            continue
        _seqname, _seqlen, nmapped, _nunmapped = line.split()
        tot_mapped_reads += int(nmapped)
    return tot_mapped_reads
Example #41
0
def getTotalReads(bam):
    """Return the overall read count and a dict of per-reference counts.

    Each count is mapped plus unmapped reads as listed by idxstats; the
    '*' record and empty lines are left out.
    """
    perChromCount = {}
    totalReads = 0
    for record in pysam.idxstats(bam).split('\n'):
        fields = record.split()
        if not fields or fields[0] == "*":
            continue
        # mapped + unmapped reads
        reads = int(fields[2]) + int(fields[3])
        perChromCount[fields[0]] = reads
        totalReads += reads
    return totalReads, perChromCount
Example #42
0
def creatChromeSize(bamFileName):
    """Write the name and length of every reference of a BAM file to a
    ".chromesize" file and return the path of that file.

    The output path is the BAM path with its extension replaced by
    ".chromesize". The '*' record and empty lines are left out.
    """
    preffixName, _suffixName = os.path.splitext(bamFileName)
    tmpChromeSizeFilename = preffixName + ".chromesize"

    # The with-block closes the file even if parsing fails
    with open(tmpChromeSizeFilename, "w") as ftmp:
        for line in pysam.idxstats(bamFileName).strip().split('\n'):
            fields = line.strip().split()
            if not fields or fields[0] == "*":
                continue
            ftmp.write(fields[0] + "\t" + fields[1] + "\n")
    return tmpChromeSizeFilename
Example #43
0
def bam_total_reads(bam_fname):
    """Count the total number of mapped reads in a BAM file.

    Uses the BAM index to do this quickly.
    """
    lines = pysam.idxstats(bam_fname)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(lines, str):
        lines = lines.splitlines()
    tot_mapped_reads = 0
    for line in lines:
        if not line.strip():
            continue
        _seqname, _seqlen, nmapped, _nunmapped = line.split()
        tot_mapped_reads += int(nmapped)
    return tot_mapped_reads
Example #44
0
def normalization(options):
    """
    find total number of mapped reads for each chromosome
    to be used as a scaling factor
    when comparing between samples

    Reads the idxstats output of ``options.file`` and returns a dict
    mapping each chromosome name to its mapped-read count.
    """
    stats = pysam.idxstats(options.file)
    # Depending on the pysam version this is a list of lines or one string
    if isinstance(stats, str):
        stats = stats.splitlines()
    norms = {}
    for record in stats:
        fields = record.split()
        if not fields:
            continue
        norms[fields[0]] = int(fields[2])
    return norms
Example #45
0
def idxstats(bam_fname, drop_unmapped=False):
    """Get chromosome names, lengths, and number of mapped/unmapped reads.

    The idxstats output of the BAM file is loaded into a table with the
    columns 'chromosome', 'length', 'mapped' and 'unmapped'. With
    drop_unmapped set, rows with no mapped reads and the 'unmapped'
    column are removed.
    """
    raw_stats = pysam.idxstats(bam_fname, split_lines=False)
    column_names = ['chromosome', 'length', 'mapped', 'unmapped']
    table = pd.read_table(StringIO(raw_stats), header=None,
                          names=column_names)
    if not drop_unmapped:
        return table
    has_mapped_reads = table['mapped'] != 0
    return table[has_mapped_reads].drop('unmapped', axis=1)
Example #46
0
def CountRandom(BamFile):
    """Count mapped reads that lie outside the chromosomes from GetChromoList().

    The per-chromosome mapped counts (third idxstats column) of every row
    whose name is in ``GetChromoList()`` are summed and subtracted from the
    ``mapped`` total reported by the opened BAM file.

    Args:
        BamFile: Path of the BAM file.

    Returns:
        ``[BamFile, {"Random": n}]`` where ``n`` is the difference described
        above.
    """
    # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string;
    # normalise so the loop below always sees whole lines.
    samIdxStats = "".join(pysam.idxstats(BamFile)).splitlines()

    samfile = pysam.Samfile(BamFile, "rb")
    try:
        TotalMapped = samfile.mapped
    finally:
        # Close the handle even if reading the total fails.
        samfile.close()

    List = GetChromoList()
    countAlign = 0
    for stat in samIdxStats:
        fields = stat.split()
        if fields and fields[0] in List:
            # int() promotes to arbitrary precision where needed, so the
            # Python-2-only long() is unnecessary.
            countAlign += int(fields[2])

    RandAlign = TotalMapped - countAlign
    return [BamFile, {"Random": RandAlign}]
Example #47
0
 def _init_read_number(self, bamFile):
     """Compute number of reads and number of mapped reads for CoverageSet.

     Sets ``self.reads`` to the sum of the mapped and unmapped columns over
     all idxstats rows and ``self.mapped_reads`` to the sum of the mapped
     column alone.

     Args:
         bamFile: Path of the BAM file handed to ``pysam.idxstats``.
     """
     # pysam < 0.9 returns a list of newline-terminated lines, pysam >= 0.9 a
     # single string. Joining normalises both, so no version sniffing (and no
     # dependency on the removed distutils module) is needed.
     rows = [line.split('\t')
             for line in "".join(pysam.idxstats(bamFile)).splitlines()
             if line.strip()]

     # Plain int() parsing replaces the former eval() of file-derived text.
     mapped_reads = sum(int(row[2]) for row in rows)
     unmapped_reads = sum(int(row[3]) for row in rows)

     self.reads = mapped_reads + unmapped_reads
     # Available for every pysam version now, not only for >= 0.9.
     self.mapped_reads = mapped_reads
Example #48
0
def _validate_mito_bam(data, handle, info):
    """Check the mitochondrial BAM against the database sequences.

    Returns True straight away when ``data.mitochondria`` is None. Otherwise
    only the first BAM contig whose name is a key of ``data.mitochondria`` is
    examined:

    * if its BAM length differs from the gap-free ("-" removed) length of the
      database sequence, an error is printed and False is returned;
    * if neither ``<bam>.bai`` nor the ``swap_ext`` variant exists, the BAM is
      indexed;
    * if idxstats reports zero hits for the contig, a warning is printed and
      True is returned without touching ``info``;
    * otherwise ``info.mt_contig``, ``info.mt_length`` and ``info.mt_padding``
      are filled in and True is returned.

    True is also returned when no contig matches.
    """
    mito_db = data.mitochondria
    if mito_db is None:
        # No mitochondrial data .. skip phylogeny
        return True

    contig_names = handle.references
    shortest = min(len(entry.sequence) for entry in mito_db.itervalues())

    for bam_contig, bam_length in zip(contig_names, handle.lengths):
        if bam_contig not in mito_db:
            continue

        db_sequence = mito_db[bam_contig].sequence
        gap_count = db_sequence.count("-")
        db_length = len(db_sequence) - gap_count

        if bam_length != db_length:
            message = ("ERROR: Length of mitochondrial contig %r (%i bp) "
                       "does not match the length of the corresponding "
                       "sequence in the database (%i bp)"
                       % (bam_contig, bam_length, db_length))
            print_err(message)
            return False

        has_index = (os.path.exists(handle.filename + '.bai')
                     or os.path.exists(swap_ext(handle.filename, '.bai')))
        if not has_index:
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        stats_text = "".join(pysam.idxstats(handle.filename))
        for row in stats_text.split('\n'):
            row = row.strip()
            if row:
                name, _, hits, _ = row.split('\t')
                if (name == bam_contig) and not int(hits):
                    print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                              "any reads aligned to contig %r; inferring an "
                              "phylogeny is not possible."
                              % (handle.filename, name))
                    return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - shortest

        # Only the first matching contig is processed.
        return True
    return True
Example #49
0
def verify_chrom_in_paths(genome_path, bamfile1, bamfile2, chrom_sizes):
    """Check whether the chromosome names of all inputs overlap.

    Args:
        genome_path: FASTA path read through ``FastaReader``.
        bamfile1: Path of the first BAM file.
        bamfile2: Path of the second BAM file.
        chrom_sizes: Tab-separated file whose first column holds chromosome
            names.

    Returns:
        True when at least one name is shared by both BAM files, the
        chromosome-size file and the genome. True is also returned when the
        BAM indices cannot be queried (best effort: the check is skipped).
        False otherwise.
    """
    def _bam_chroms(bamfile):
        # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string.
        # Joining handles every version, unlike an exact '0.9.0' comparison,
        # and idxstats is run only once per file.
        rows = "".join(pysam.idxstats(bamfile)).splitlines()
        return set(row.split('\t')[0] for row in rows if row)

    # check bam files
    try:
        chrom_bams = _bam_chroms(bamfile1) & _bam_chroms(bamfile2)
    except Exception:
        # Deliberate best effort: if the BAM files cannot be inspected the
        # verification is skipped instead of failing the caller.
        return True

    # check chrom_sizes
    with open(chrom_sizes) as f:
        chrom_chrom_sizes = set(line.split('\t')[0] for line in f)

    tmp = chrom_bams & chrom_chrom_sizes
    if not tmp:
        return False

    # check genome: one overlap is sufficient
    for s in FastaReader(genome_path):
        if s.name in tmp:
            return True

    # No genome sequence matched, so the three-way intersection is empty.
    return False
def get_chrom_lengths(path_to_bam):
    '''
    Retrieve chromosome sizes from a BAM file via pysam.idxstats.

    Handy together with pybedtools functions (e.g. coverage) when the BAM was
    mapped against a custom genome that is not available in UCSC.

    Input: path to the BAM file (should be indexed).
    Output: dictionary mapping every idxstats row name (the '*' row included)
    to a (0, length) tuple, e.g.
    {'chr4': (0, 1351857), 'chr2L': (0, 23011544), '*': (0, 0)}
    '''
    rows = (record.split("\t")
            for record in pysam.idxstats(path_to_bam).splitlines())
    return {fields[0]: (0, int(fields[1])) for fields in rows}
Example #51
0
 def _init_read_number(self, bamFile):
     """Compute number of reads and number of mapped reads for CoverageSet.

     Sets ``self.reads`` to the sum of the mapped and unmapped columns over
     all idxstats rows and ``self.mapped_reads`` to the sum of the mapped
     column alone. If the statistics cannot be obtained or parsed, both
     attributes are set to None.

     Args:
         bamFile: Path of the BAM file handed to ``pysam.idxstats``.
     """
     try:
         # pysam < 0.9 returns a list of newline-terminated lines, pysam >= 0.9
         # a single string. Joining normalises both; an exact '0.9.0' version
         # comparison would send every later release down the list branch.
         rows = [line.split('\t')
                 for line in "".join(pysam.idxstats(bamFile)).splitlines()
                 if line.strip()]
         # Plain int() parsing replaces the former eval() of file-derived text.
         mapped_reads = sum(int(row[2]) for row in rows)
         unmapped_reads = sum(int(row[3]) for row in rows)
     except Exception:
         # Best effort, as before: unknown counts are reported as None.
         self.reads = None
         self.mapped_reads = None
     else:
         self.reads = mapped_reads + unmapped_reads
         self.mapped_reads = mapped_reads
Example #52
0
def bam_total_reads(bam_handle, chroms_to_ignore):
    """Count the total number of mapped reads in a BAM file, filtering
    the chromosomes given in the chroms_to_ignore list.

    Args:
        bam_handle: Open BAM handle; its ``mapped`` and ``filename``
            attributes are used.
        chroms_to_ignore: Names to exclude. When empty or None the handle's
            own ``mapped`` total is returned unchanged.

    Returns:
        Number of mapped reads outside the ignored chromosomes.
    """
    if not chroms_to_ignore:
        return bam_handle.mapped

    import pysam

    # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string;
    # joining normalises both so the loop always sees whole lines.
    lines = "".join(pysam.idxstats(bam_handle.filename)).splitlines()
    tot_mapped_reads = 0
    for line in lines:
        if not line.strip():
            continue
        chrom, _len, nmapped, _nunmapped = line.split("\t")
        if chrom not in chroms_to_ignore:
            tot_mapped_reads += int(nmapped)

    return tot_mapped_reads
Example #53
0
def write_table(filename, outfile):
    """
    Build a count table for a BAM file with pysam.

    The BAM file is indexed first, then the output of pysam's idxstats is
    written unchanged to the count table.

    Args :
        filename [STR] : BAM file to count
        outfile [STR] : name of the count table to write

    Returns nothing.
    """
    # Index first; idxstats reads its numbers from the index.
    pysam.index(filename)
    counts = pysam.idxstats(filename)
    # Dump the idxstats output as-is.
    with open(outfile, 'wt') as out:
        out.writelines(counts)
def get_chromosomes_info(bam_path):
    """Collect per-chromosome information from the BAM index.

    Creates the index first when ``<bam_path>.bai`` does not exist. If
    idxstats fails or its output cannot be processed, an error is logged and
    the process exits with status 1.

    Args:
        bam_path: Path of the BAM file.

    Returns:
        List with one entry per idxstats row other than the ``*`` row. Each
        entry holds the row's tab-separated fields without the last one, i.e.
        ``[name, length, mapped]`` as strings.
    """
    # Check if there is an index file, create one if there isn't
    if not os.path.isfile(bam_path + ".bai"):
        pysam.index(bam_path)
        logging.info('No BAM index file was found, new index was generated : `{}`'.format(bam_path + ".bai"))
    # Take chromosome data from BAM index:
    # (ref.seq. name, ref.seq. length, number of mapped reads and number of unmapped reads)
    logging.info('Collecting information about sample from .bai file: '
                 '[ref.seq. name, ref.seq. length, number of mapped and unmapped reads]')
    logging.info("\nGenome ID {} \nEstimated mappability {}".format('?', '?'))
    chromosomes_info = []
    try:
        # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string;
        # joining normalises both so whole lines are processed.
        for line in "".join(pysam.idxstats(bam_path)).splitlines():
            if not line.strip():
                continue
            fields = line.split("\t")
            # The "*" row holds unmapped reads, we don't need it. Matching it
            # by name is safer than dropping whatever row happens to be last.
            if fields[0] == "*":
                continue
            chromosomes_info.append(fields[:-1])
    except Exception:
        logging.error("\nPROBLEM WITH BAM FILE OR pysam.idxstats() COMMAND\nYour BAM file {} probably is not sorted."
                      "\n\nTo sort it with samtools use comand: \n'samtools sort {} {}'"
                      .format(bam_path, bam_path, bam_path[:-3] + 'sorted'))
        sys.exit(1)
    return chromosomes_info
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None):
    """Count the reads that overlap the regions of a blacklist BED file.

    Args:
        bam_handle: Open BAM handle; its ``filename`` attribute and ``count``
            method are used.
        chroms_to_ignore: Chromosome names whose blacklist regions are
            skipped. May be empty or None.
        blackListFileName: Path of the blacklist BED file. When None, 0 is
            returned without touching the BAM file.

    Returns:
        Sum of ``bam_handle.count`` over all considered blacklist regions.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    import pysam
    import deeptools.mapReduce as mapReduce

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    # pysam >= 0.9 returns one string instead of a list of lines.
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        if not line.strip():
            continue
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    # NOTE(review): assumes BED_to_interval_tree reads the whole file before
    # returning, so the handle can be closed right away -- confirm.
    with open(blackListFileName, "r") as bed_file:
        bl = mapReduce.BED_to_interval_tree(bed_file)

    for chrom in bl.keys():
        if chroms_to_ignore and chrom in chroms_to_ignore:
            continue
        if chrom not in chromLens:
            # Blacklist chromosome absent from the BAM: nothing to count,
            # and looking up its length would raise KeyError.
            continue
        for reg in bl[chrom].find(0, chromLens[chrom]):
            blacklisted += bam_handle.count(reference=chrom, start=reg.start, end=reg.end)

    return blacklisted
Example #56
0
def Main():
    """Annotate every interaction of the input file with RPKM values.

    Reads the command line through ``ParseArg()``, determines the total read
    count of every data set (from the BAM index for "bam" inputs, by counting
    records otherwise) and writes one output line per interaction with the
    RPKM of both interacting regions appended for every data set.
    Interactions touching "chrM" are skipped. Progress is reported on stderr.
    """
    args = ParseArg()

    if len(args.data) != len(args.name):
        sys.stderr.write("ERROR: Number of data is not the same as number of names!\n")
        # Exit non-zero so that callers can detect the failure.
        sys.exit(1)

    # store data information
    data = {}
    total_reads = {}
    for i, (data_path, temp_name) in enumerate(zip(args.data, args.name)):
        sys.stderr.write("\n Reading data file:" + temp_name + "...\n")
        data_format = args.format[i]
        total_reads[temp_name] = 0
        if data_format == "bam":
            # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single
            # string; joining normalises both before the mapped-read column
            # is summed.
            idx_rows = "".join(pysam.idxstats(data_path)).splitlines()
            total_reads[temp_name] = sum(int(row.split('\t')[2])
                                         for row in idx_rows if row.strip())
        else:
            Format = "bed"
            for _record in TableIO.parse(data_path, Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    sys.stderr.write("  reading %d reads..\r" % (total_reads[temp_name]))
        data[temp_name] = DBI.init(data_path, data_format)

    # Freeze the data-set order once so header and value columns agree.
    names = list(data)

    # Context managers make sure both files are flushed and closed.
    with open(args.output, 'w') as output, open(args.input, 'r') as Input:
        lines = Input.read().split("\n")

        # header
        header = ["chr", "start", "end", "type", "name", "subtype", "count"] + names
        output.write("\t".join(g + "_%d" % (f) for f in [1, 2] for g in header) + "\tinteraction\tp-value\n")

        num = 0
        sys.stderr.write("Start process interactions:\n")
        for l in lines:
            if l.strip() == '':
                continue
            l = l.strip().split('\t')
            num = num + 1
            if l[0] == "chrM" or l[7] == "chrM":
                continue
            C1 = Bed([l[0], int(l[1]), int(l[2])])
            C2 = Bed([l[7], int(l[8]), int(l[9])])
            rpkm1 = "\t".join(str(f) for f in [RPKM(C1, data[n], total_reads[n], n) for n in names])
            rpkm2 = "\t".join(str(f) for f in [RPKM(C2, data[n], total_reads[n], n) for n in names])
            output.write("\t".join(str(f) for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]]) + "\n")
            if num % 1000 == 0:
                sys.stderr.write("  Output interaction: %d\r" % (num))
def count_telomeric_reads(bamfile, q):
    """Count total and telomeric reads of a BAM file.

    Telomeric reads are extracted into ``*_TelomericReads.sam`` (derived from
    the BAM path with ``options.bamdir`` replaced by ``options.outdir``)
    unless that file already exists. The total read count comes from the BAM
    index.

    Args:
        bamfile: Path of the BAM file.
        q: Queue-like object; the result is handed to ``q.put``.

    Returns:
        List of four strings: sample name (BAM basename up to the first "_"),
        total read count, telomeric read count, and telomeric reads per
        100000 total reads.
    """
    # generate Telomere reads file name
    telofile = bamfile.replace(options.bamdir, options.outdir).replace(".bam", "_TelomericReads.sam")

    # check if the file was already generated
    if not os.path.exists(telofile):
        # extract telomeric reads and write to file
        # NOTE(review): paths are interpolated into a shell pipeline unquoted;
        # file names containing shell metacharacters would break or alter it.
        cmd = options.sambamba+" view "+bamfile+" -t "+ str(options.nr_cpus) +" | LC_ALL=C grep -E \"" + "TTAGGG"*options.repsize +"|"+ "CCCTAA"*options.repsize + "\"" + " > " + telofile
        print("++++ Generating SAM file: "+telofile)
        os.system(cmd)

    # count total number of reads (mapped + unmapped columns of every row).
    # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string;
    # joining normalises both. int() replaces eval() on file-derived text.
    total_rc = 0
    for line in "".join(pysam.idxstats(bamfile)).splitlines():
        if not line.strip():
            continue
        total_rc += sum(int(value) for value in line.split('\t')[2:])

    sleep(1)

    telomere_rc = 0
    if os.path.exists(telofile):
        # count number of telomeric reads by line count; the context manager
        # closes the handle deterministically.
        with open(telofile, 'r') as telo_handle:
            telomere_rc = sum(1 for _line in telo_handle)
    else:
        print("Something went wrong with BAM file: "+bamfile)

    # return results
    result = [str(bamfile.split("/")[-1].split("_")[0]), str(total_rc), str(telomere_rc), str((telomere_rc/(total_rc*1.0))*100000.0)]
    q.put(result)
    return(result)
Example #58
0
def bam_statistics(bam_filename):
    """Sum the mapped and unmapped read counts reported by the BAM index.

    Args:
        bam_filename: Path of the BAM file handed to ``pysam.idxstats``.

    Returns:
        Dict with the keys ``'mapped'``, ``'notmapped'`` and ``'all'``
        (their sum), computed over every idxstats row except the ``*`` row.
    """
    # pysam < 0.9 returns a list of lines, pysam >= 0.9 a single string on
    # which the former `del stats[-1]` fails; joining normalises both.
    rows = [line.split("\t")
            for line in "".join(pysam.idxstats(bam_filename)).splitlines()
            if line.strip()]
    # The original dropped the final row, which is the "*" row; it is matched
    # by name here.
    # NOTE(review): reads reported only on the "*" row are therefore not part
    # of 'notmapped' -- confirm this is intended.
    rows = [row for row in rows if row[0] != "*"]
    mapped_reads = sum(int(row[2]) for row in rows)
    notmapped_reads = sum(int(row[3]) for row in rows)
    return {'mapped': mapped_reads, 'notmapped': notmapped_reads, 'all': mapped_reads + notmapped_reads}
Example #59
0
def test_idxstats_parse_split_lines():
    """Every line of list-style idxstats output must split into four fields."""
    bam_filename = "./pysam_data/ex2.bam"
    # split_lines=True requests the pysam 0.8.X style output, a list of lines.
    for record in pysam.idxstats(bam_filename, split_lines=True):
        fields = record.split()
        # The unpacking itself is the check: it raises unless there are
        # exactly four whitespace-separated columns.
        _seqname, _seqlen, nmapped, _nunmapped = fields
Example #60
0
    def coverage_from_bam(self, bam_file, read_size = 200, binsize = 100, stepsize = 50, rmdup = True, mask_file = None):
        """Return list of arrays describing the coverage of each genomicRegions from <bam_file>. 
        Consider reads in <bam_file> with a extension size of <read_size>.
        Remove duplicates (read with same position) with rmdup=True (default).
        Divide the genomic regions in bins with a width of <binsize> and use <stepsize> to smooth the signal."""
        self.binsize = binsize
        self.stepsize = stepsize
        
        bam = pysam.Samfile(bam_file, "rb" )
        for read in bam.fetch():
            read_size += read.rlen
            break
        self.mapped_reads = reduce(lambda x, y: x + y, [ eval('+'.join(l.rstrip('\n').split('\t')[2:3]) ) for l in pysam.idxstats(bam_file) ])
        self.reads = reduce(lambda x, y: x + y, [ eval('+'.join(l.rstrip('\n').split('\t')[2:]) ) for l in pysam.idxstats(bam_file) ])
        #print("Loading reads of %s..." %self.name, file=sys.stderr)
        
        #check whether one should mask
        next_it = True
        if mask_file is not None and os.path.exists(mask_file):
            mask = True
            f = open(mask_file, 'r')
            c_help, s_help, e_help = self.genomicRegions.sequences[0].chrom, -1, -1
        else:
            mask = False
        
        chrom_regions = [r.chrom for r in self.genomicRegions.sequences] #chroms by regions
        
        for region in self.genomicRegions:
            cov = [0] * (len(region) / stepsize)

            positions = []
            j = 0
            read_length = -1
            try:
                for read in bam.fetch(region.chrom, max(0, region.initial-read_size), region.final+read_size):
                    
                    j += 1
                    read_length = read.rlen 
                    if not read.is_unmapped:
                        pos = read.pos - read_size if read.is_reverse else read.pos
                        pos_help = read.pos - read.qlen if read.is_reverse else read.pos
                        
                        #if position in mask region, then ignore
                        if mask:
                            while next_it and c_help not in chrom_regions: #do not consider this deadzone
                                c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                            if c_help != -1 and chrom_regions.index(region.chrom) >= chrom_regions.index(c_help): #deadzones behind, go further
                                while next_it and c_help != region.chrom: #get right chromosome
                                    c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                            while next_it and e_help <= pos_help and c_help == region.chrom: #check right position
                                c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                            if next_it and s_help <= pos_help and c_help == region.chrom:
                                continue #pos in mask region
                        
                        positions.append(pos)
            except ValueError:
                pass
            if rmdup:
                positions = list(set(positions))
            positions.sort()
            positions.reverse()
            
            i = 0
            while positions:
                win_s = max(0, i * stepsize - binsize*0.5) + region.initial
                win_e = i * stepsize + binsize*0.5 + region.initial 
                c = 0
                taken = []
                while True:
                    s = positions.pop()
                    
                    taken.append(s)
                    if s < win_e: #read within window
                        c += 1
                    if s >= win_e or not positions:
                        taken.reverse()
                        for s in taken:
                            if s + read_size + read_length >= win_s: #consider read in next iteration
                                positions.append(s)
                            else:
                                break #as taken decreases monotonously
                        taken = []
                        break
                
                if i < len(cov):
                    cov[i] = c

                i += 1

            self.coverage.append(np.array(cov))

        self.coverageorig = self.coverage[:]