Example #1
def get_raw_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift) = arguments

    bam = Samfile(reads_file, "rb")
    signal = np.zeros(window_size)

    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue
        # Fetch raw signal
        for read in bam.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0

    return signal
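
The single `arguments` tuple suggests get_raw_signal is meant to be dispatched through a worker pool. A minimal sketch of such a call, assuming hypothetical region objects exposing .chrom/.initial/.final, a local reads.bam, and placeholder window/shift values:

from multiprocessing import Pool

# hypothetical inputs: `regions` is a list of objects with .chrom, .initial and .final
chunks = [regions[i::4] for i in range(4)]
jobs = [(chunk, "reads.bam", "hg38", 400, 5, -4) for chunk in chunks]

with Pool(4) as pool:
    partial_signals = pool.map(get_raw_signal, jobs)
total_signal = sum(partial_signals)  # element-wise sum of the per-worker numpy arrays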
Example #2
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open your bam file
    SF = open(snpsites, 'r')     # the file containing SNP site info
    RF = open(out, 'w')          # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        line = ParseSNPsitesLine(i)
        vcf_pos = line.pos - 1  # change 1-based to 0-based
        vcf_refname = line.chrom
        vcf_Rbase = line.Rbase
        vcf_Abase = line.Abase
        print 'processing: %s %s...' % (vcf_refname, str(vcf_pos))
        At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
        for col in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
            if col.pos == vcf_pos:
                for read in col.pileups:
                    yourbase = read.alignment.seq[read.qpos]
                    if yourbase == 'A': At += 1
                    elif yourbase == 'T': Tt += 1
                    elif yourbase == 'C': Ct += 1
                    elif yourbase == 'G': Gt += 1
                    elif yourbase == 'N': Nt += 1
                    else: othert += 1
        RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase, str(At),
            str(Tt), str(Ct), str(Gt), str(Nt), str(othert)))
    BF.close()
Example #3
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(
                    align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                    if align_len != 0 and nm >= 0:
                        paired_perc_id = ((align_len - nm) / align_len) * 100
                        if paired_perc_id >= identity_threshold:
                            match.setdefault(query_name, set())
                            match[query_name].add(ref_name)
        sam.close()
    return match
Example #4
 def __init__(self, file_name):
     """ 
     Initializes GenomicSignal.
     """
     self.file_name = file_name
     self.sg_coefs = None
     self.bam = Samfile(file_name, "rb")
Example #5
def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert it.next().strip() == "1\t9\t10\t4\t2"
    assert it.next().strip() == "1\t11\t12\t2\t1"
    assert it.next().strip() == "1\t29\t30\t2\t1"

    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = it.next()
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = it.next()
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
Example #6
def test_against_fixtures():

    # load fixtures from numpy array
    bampath = "fixture/test.bam"
    fastapath = "fixture/ref.fa"
    archive = "fixture/regression.npz"

    testset = np.load(archive)

    for q in stats_types:
        if q in stats_types_withref:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath),
                                                 fafile=fastapath)
        else:
            x = getattr(pysamstats, "load_" + q)(Samfile(bampath))

        # loop through all fields
        for key in testset[q].dtype.names:
            expect = testset[q][key]
            actual = x[key]
            try:
                np.testing.assert_array_equal(expect, actual, err_msg=key)
            except AssertionError:
                print(expect[expect != actual])
                print(actual[expect != actual])
                raise
Example #7
def test_pileup_pad():
    kwargs_nopad = {'chrom': 'Pf3D7_01_v3',
                    'start': 0,
                    'end': 20000,
                    'one_based': False,
                    'pad': False}
    kwargs_pad = {'chrom': 'Pf3D7_01_v3',
                  'start': 0,
                  'end': 20000,
                  'one_based': False,
                  'pad': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_nopad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_nopad)
        eq_(924, a['pos'][0])
        eq_(9935, a['pos'][-1])
        # test pad
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_pad)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_pad)
        eq_(0, a['pos'][0])
        eq_(19999, a['pos'][-1])
        assert np.all(np.diff(a['pos']) == 1)
Example #8
def test_pileup_truncate():
    kwargs_notrunc = {'chrom': 'Pf3D7_01_v3',
                      'start': 2000,
                      'end': 2100,
                      'one_based': False,
                      'truncate': False}
    kwargs_trunc = {'chrom': 'Pf3D7_01_v3',
                    'start': 2000,
                    'end': 2100,
                    'one_based': False,
                    'truncate': True}
    for f, needs_ref in pileup_functions:
        debug(f.__name__)
        # test no truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_notrunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_notrunc)
        debug(a[:5])
        eq_(1952, a['pos'][0])
        eq_(2154, a['pos'][-1])
        # test truncate
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs_trunc)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs_trunc)
        eq_(2000, a['pos'][0])
        eq_(2099, a['pos'][-1])
Example #10
def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping 
    onto a segment (chr, start, end). No normalization is done at this step.
    """
    abam = Samfile(str(afn), "rb")
    bbam = Samfile(str(bfn), "rb")
    acount = []
    bcount = []
    oldchr = "chr1"
    for reg in regs:
        chr, start, end = reg[:3]
        if chr != oldchr:
            log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
            oldchr = chr
        # this could be improved
        for s in xrange(start, end, step):
            e = s + step
            an = abam.count(chr, s, e)
            bn = bbam.count(chr, s, e)
            acount.append(an)
            bcount.append(bn)
        acount.append(-1)
        bcount.append(-1)
    log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    return acount, bcount
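
A hedged usage sketch for parse_bam_differential, assuming two indexed BAM files and hypothetical regions given as (chrom, start, end) tuples; each region contributes one count per step-sized window, followed by a -1 separator:

regs = [("chr1", 0, 100000), ("chr2", 50000, 150000)]  # hypothetical regions
acount, bcount = parse_bam_differential("sampleA.bam", "sampleB.bam", regs, step=1000)
# acount and bcount hold the window counts for each file, with -1 marking region boundaries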
Example #11
def test_binned_pad_wg():
    expected = stat_coverage_binned_refimpl(
        Samfile('fixture/test.bam'),
        Fastafile('fixture/ref.fa'))

    actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'),
                                             Fastafile('fixture/ref.fa'))
    compare_iterators(expected, actual)
    kwargs = {'window_size': 200,
              'window_offset': 100}
    for f, needs_ref in binned_functions:
        debug(f.__name__)
        if needs_ref:
            a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'),
                  **kwargs)
        else:
            a = f(Samfile('fixture/test.bam'), **kwargs)
        assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3',
                                           b'Pf3D7_03_v3']
        eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0])
        eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0])
        eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1])
        eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0])
        eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])
Example #12
def bam_fill_seq(args):
    """ Fill empty sequence with known seqs
    """
    if not args.source_bam:
        source_bam = args.bam
    else:
        source_bam = args.source_bam
    logging.info('Loading samfile: %s', source_bam)
    src_seqs = {1: {}, 2: {}}

    src = pysam.Samfile(source_bam)
    with src:
        for rec in src:
            if rec.is_supplementary:  # skip supplementary alignment
                continue
            if rec.is_secondary:  # skip secondary alignment
                continue
            if rec.query_sequence is None:  # empty
                continue
            if rec.is_read2:
                src_seqs[2][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)
            else:
                src_seqs[1][rec.qname] = (rec.query_sequence,
                                          rec.query_qualities, rec.is_reverse)

    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    sam = Samfile(args.bam)
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)

    if args.region:
        it = sam.fetch(region=args.region)
    else:
        it = sam

    for rec in it:
        qname = rec.qname
        if rec.query_sequence is None:  # only fill when empty
            ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
            if ret is not None:
                seq, qual, is_rev = ret
                if is_rev != rec.is_reverse:
                    seq = dna_revcomp(seq)
                    if qual is not None:
                        qual = list(reversed(qual))
                cigar = Cigar(rec.cigartuples)
                seq = cigar.hard_clip_seq(seq)
                if qual is not None:
                    qual = cigar.hard_clip_seq(qual)
                rec.query_sequence = seq  # refill
                rec.query_qualities = qual

        out.write(rec)
Example #13
def bam_read_id(url):
    '''Read first read id out of a remote bam file.

    Note: requires a patched version of pysam
    '''
    stream = Samfile(url, 'rb')
    read = stream.next()
    return read.qname
Example #14
def process_bam(abam, bbam, mismatches=0):
    """Removes duplicate reads characterized by their UMI at any given start location.

    Args:
        abam (str): Input bam with potential duplicate UMIs
        bbam (str): Output bam after removing duplicate UMIs
        mismatches (Optional[int]): Allowable edit distance between UMIs
    """
    is_indexed(abam)
    with Samfile(abam, 'rb') as in_bam, Samfile(bbam, 'wb', template=in_bam) as out_bam:

        for chrom in in_bam.references:
            print("processing chromosome", chrom, file=sys.stderr)

            umi_idx = defaultdict(set)
            read_counts = Counter()

            for read in in_bam.fetch(chrom):
                if read.is_unmapped:
                    continue

                # get the iupac umi sequence
                try:
                    umi = umi_from_name(read.qname)
                except UMINotFound:
                    print("You may be processing alignments that haven't been annotated with UMIs!", file=sys.stderr)
                    raise

                # get actual read start
                # read.pos accounts for 5' soft clipping
                if read.is_reverse:
                    # read.alen alignment length accounting for 3' soft clipping
                    # UMIs are then compared to reads with the same start
                    read_start = read.pos + read.alen
                else:
                    read_start = read.pos

                # add count for this start; counts all reads
                read_counts[read_start] += 1

                # check if UMI seen
                if umi in umi_idx[read_start]:
                    continue

                # check if UMI is similar enough to another that has been seen
                if mismatches > 0 and is_similar(umi, umi_idx[read_start], mismatches):
                    # do not count; group similar UMIs into one
                    continue

                # keep track of unique UMIs - set eliminates duplicates
                umi_idx[read_start].add(umi)

                out_bam.write(read)

            # process before and after counts over chrom
            for start, before_count in sorted(read_counts.items()):
                print(chrom, start, start + 1, before_count, len(umi_idx[start]), sep="\t")
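
A minimal sketch of driving process_bam, assuming a coordinate-sorted, indexed input BAM whose read names carry the UMI (both paths are placeholders); the per-position before/after counts are printed to stdout:

# hypothetical paths; the input must be coordinate-sorted and indexed
process_bam("sample.umi.sorted.bam", "sample.dedup.bam", mismatches=1)
# redirect stdout to capture the BED-like count lines, e.g.
#   python dedup_umi.py > umi_counts.bed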
Example #15
def bam_variant_aln(args):
    samfile = Samfile(args.bam)
    # collect (chrom, pos) for every variant in the VCF
    with open(args.vcf) as fp:
        reader = pyvcf.Reader(fp)
        positions = [(rec.CHROM, rec.POS) for rec in reader]

    # report the reads overlapping each variant position
    for chrom, pos in positions:
        for rec in samfile.fetch(chrom, pos - 1, pos):
            print(samfile.getrname(rec.tid), rec.pos, rec.qname)
Example #16
def _open_alignment_file(in_path):
    """Returns alignment file handle for BAM, SAM, or CRAM"""
    input_extension = in_path.split(".")[-1].lower()
    if input_extension == "bam":
        alignment_file = Samfile(in_path, "rb")
    elif input_extension == "sam":
        alignment_file = Samfile(in_path, "r")
    elif input_extension == "cram":
        alignment_file = Samfile(in_path, "rc")
    return alignment_file
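
A small usage sketch for _open_alignment_file with hypothetical file names; note that opening a CRAM usually also requires the reference genome to be reachable for decoding:

for path in ("sample.bam", "sample.sam", "sample.cram"):
    handle = _open_alignment_file(path)
    print(path, handle.references[:3])  # first few reference names from the header
    handle.close()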
Example #17
    def __init__(self, file_name):
        """ 
        Initializes BamFile.

        Variables:
        bam -- Pysam's bam representation.
        sg_coefs -- Savitzky-Golay coefficients (list). Should be loaded after class initialization.
        """
        self.file_name = file_name
        self.bam = Samfile(file_name, "rb")
        self.sg_coefs = None
Example #18
def parse_bam_absolute(fn, regs):
    """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping 
    onto a segment (chr, start, end) and normalizes the count by the segment's length.
    """
    bam = Samfile(str(fn), "rb")
    count = []
    for reg in regs:
        chr, start, end = reg[:3]
        n = bam.count(chr, start, end)
        count.append(float(n) / (end - start))
    return count
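
A hedged usage sketch for parse_bam_absolute, assuming an indexed BAM and hypothetical (chrom, start, end) segments; the returned values are read counts normalized by segment length:

regs = [("chr1", 1000, 2000), ("chr1", 5000, 5500)]  # hypothetical segments
density = parse_bam_absolute("sample.bam", regs)
for (chrom, start, end), d in zip(regs, density):
    print(chrom, start, end, round(d, 3))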
Example #20
def test_write_hdf5_chrom_dtype():

    contig_label = "AS2_scf7180000696055"
    bampath = "fixture/longcontignames.bam"

    dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}]
    alignments = [Samfile(bampath), Samfile(bampath), bampath]
    results = [len(contig_label), 20, 20]
    labels = [contig_label, contig_label, contig_label]

    for arg in zip(dtypes, alignments, results, labels):
        assert check_write_hdf5_chrom_dtype(arg)
Example #21
def create_table(half_ext, feature_summit_file_name, bam_names, bam_counts, bam_list, output_file_name):

  # Initialization
  outLoc = "/".join(output_file_name.split("\t")[:-1]) + "/"
  command = "mkdir -p "+outLoc
  os.system(command)

  # Allowed chromosomes
  chrList = ["chr"+str(e) for e in range(1,23)+["X"]]

  # Fetching regions
  featureSummitFile = open(feature_summit_file_name,"r")
  regionList = []
  for line in featureSummitFile:
    ll = line.strip().split("\t")
    if(ll[0] not in chrList): continue
    region = [ll[0], int(ll[1])-half_ext, int(ll[2])+half_ext]
    if(int(region[1]) < 0): continue
    regionList.append(region)
  featureSummitFile.close()

  # Creating table
  matrix = []
  for i in range(0,len(bam_list)):
    inputBamFileName = bam_list[i]
    correctFactor = int(bam_counts[i])/1000000
    extension = inputBamFileName.split(".")[-1]
    if(extension == "bam"):
      bamFile = Samfile(inputBamFileName,"rb")
      vec = []
      for region in regionList:
        try: bamSignal = fetchSignal(bamFile, region) / correctFactor
        except Exception: bamSignal = 0
        vec.append(bamSignal)
    elif(extension == "bw" or extension == "bigwig"):
      bamFile = pyBigWig.open(inputBamFileName)
      vec = []
      for region in regionList:
        try: bamSignal = fetchSignalBw(bamFile, region) / correctFactor
        except Exception: bamSignal = 0
      vec.append(bamSignal)
    else: print("The tool supports only BAM or BIGWIG files.")
    matrix.append(vec)
    bamFile.close()
  outputFile = open(output_file_name,"w")
  outputFile.write("\t".join(bam_names)+"\n")
  for j in range(0,len(matrix[0])):
    vec = []
    for i in range(0,len(matrix)):
      try: vec.append(str(matrix[i][j]))
      except Exception: vec.append("NA")
    outputFile.write("\t".join(vec)+"\n")
  outputFile.close()
Example #22
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG",
                        add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location,
                                "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" +
                           str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e)
                                      for e in np.nan_to_num(signal)]) + "\n")
    output_f.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location,
                                   "{}.bw".format(args.output_prefix))
        os.system(" ".join([
            "wigToBigWig", output_fname, chrom_sizes_file, bw_filename,
            "-verbose=0"
        ]))
        os.remove(output_fname)
Example #23
    def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
        startTime = Helper.getTime()
        minDistance = int(minDistance)
        counter = 0
        j = 0
        num_lines = len(self.variantDict)
        Helper.info(
            " [%s] remove Missmatches from the first %s bp from read edges" %
            (startTime.strftime("%c"), str(minDistance)), self.logFile,
            self.textField)

        bamFile = Samfile(bamFile, "rb")

        for varKey in self.variantDict.keys():
            variant = self.variantDict[varKey]

            counter += 1
            if counter % 10000 == 0:
                Helper.status('%s mm parsed ' % counter, self.logFile,
                              self.textField, "grey")

            keepSNP = False
            varPos = variant.position - 1
            iter = bamFile.pileup(variant.chromosome, variant.position - 1,
                                  variant.position)
            # walks over the pileup columns which overlap this position
            for x in iter:
                if x.pos == varPos:
                    for pileupread in x.pileups:  #walk through the single reads
                        if not pileupread.is_del and not pileupread.is_refskip:
                            distance = abs(
                                pileupread.alignment.alen -
                                pileupread.query_position
                            ) if pileupread.alignment.is_reverse else pileupread.query_position
                            if distance >= minDistance:
                                #check readBase and Base Quality
                                if pileupread.alignment.query_sequence[
                                        pileupread.
                                        query_position] == variant.alt and pileupread.alignment.query_qualities[
                                            pileupread.
                                            query_position] >= minBaseQual:
                                    #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                    keepSNP = True

            if keepSNP == False:
                j += 1
                del self.variantDict[varKey]

        Helper.status('%s of %svariants were deleted' % (j, num_lines),
                      self.logFile, self.textField, "black")
        Helper.printTimeDiff(startTime, self.logFile, self.textField)
        bamFile.close()
Example #24
def main():

    bam  = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")
    
    for al in bam:
        chrom = bam.getrname(al.rname)
        start = al.pos
        end   = al.aend
        name  = al.qname 
        for hit in rmsk.search(chrom, start, end):
            print chrom, start, end, name,
            print hit.chrom, hit.start, hit.end, hit.name
Example #25
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)

    #Iterates over each read instead of each contig
    outputs = defaultdict(list)
    #import ipdb; ipdb.set_trace()
    for aln in samfile.fetch(until_eof = True):
        ref = samfile.getrname(aln.tid)
        outputs[ref].append(aln)

    for ref, alns in outputs.iteritems():
        print_reads(alns, ref, samfile.header)
Example #26
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    f = Samfile(args[0])
    header = f.header
    f.close()

    reflen = header['SQ'][0]['LN']

    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)

    return 0
Example #27
def paired_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of PE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of PE data produced an error."
    else:
        to_process.append(sam_list[1])
    for paired_sam in to_process:
        r1_match = {}
        r2_match = {}
        sam = Samfile(paired_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(
                    align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    # print query_name, align_len, query_aligned_len
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                    if align_len != 0 and nm >= 0:
                        paired_perc_id = ((align_len - nm) / align_len) * 100
                        if paired_perc_id >= 90:
                            if align.is_read1:
                                r1_match.setdefault(query_name, {})
                                r1_match[query_name].setdefault(ref_name, [])
                                r1_match[query_name][ref_name].append(
                                    paired_perc_id)
                            if align.is_read2:
                                r2_match.setdefault(query_name, {})
                                r2_match[query_name].setdefault(ref_name, [])
                                r2_match[query_name][ref_name].append(
                                    paired_perc_id)
        sam.close()
        for query in set(r1_match.keys()).intersection(set(r2_match.keys())):
            for ref in set(r1_match[query].keys()).intersection(
                    r2_match[query].keys()):
                average_perc_id = calcola_media(
                    [max(r1_match[query][ref]),
                     max(r2_match[query][ref])])
                if average_perc_id >= identity_threshold:
                    match.setdefault(query, set())
                    match[query].add(ref)
    return match
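
The parser returns a dict mapping each read name to the set of references it matched in either mapping mode; a hypothetical call with placeholder SAM paths and thresholds:

match = paired_end_sam_parsing(["pe_end2end.sam", "pe_local.sam"], cov=70, identity_threshold=95)
for query, refs in match.items():
    print("%s\t%s" % (query, ",".join(sorted(refs))))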
Example #28
    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader"
            self._loadReferenceFasta(referenceFastaFname)
Example #29
def main():

    bam = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")

    # Example 1:
    #    Method: IntervalFile.all_hits()
    #    Report _all_ of the rmsk features that overlap with the BAM alignment
    for al in bam:
        strand = "+"
        if al.is_reverse: strand = "-"
        i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand)

        for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75):
            print "\t".join(str(x) for x in [i, hit])
Example #30
def compare_stats(impl, refimpl):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile('fixture/test.bam'), **kwargs)
    actual = impl(Samfile('fixture/test.bam'), **kwargs)
    compare_iterators(expected, actual)
Example #31
def main():

    bam  = Samfile("bedtools/tests/data/NA18152.bam", "rb")
    rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed")
    
    # Example 1:
    #    Method: IntervalFile.all_hits()
    #    Report _all_ of the rmsk features that overlap with the BAM alignment
    for al in bam:
        strand = "+"
        if al.is_reverse: strand = "-"
        i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand)
        
        for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75):
            print "\t".join(str(x) for x in [i,hit])
Example #32
def expression_dict_from_bam(alias_dict, gene_dict, exp_file_name):

    # Fetching expression
    exp_dict = dict()
    exp_file = Samfile(exp_file_name, "rb")
    for k in gene_dict.keys():
        geneVec = gene_dict[k]
        gene = geneVec[3]
        region = [geneVec[0], int(geneVec[1]), int(geneVec[2])]
        exp = fetch_counts(exp_file, region)
        exp_dict[gene] = float(exp) / (region[2] - region[1])
    exp_file.close()

    # Returning objects
    return exp_dict
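
From the body above, gene_dict values are expected to look like [chrom, start, end, gene_name] and fetch_counts is assumed to count the reads in a region; a hypothetical call against an indexed RNA-seq BAM:

gene_dict = {"g1": ["chr1", "11869", "14409", "DDX11L1"]}  # hypothetical entry
exp = expression_dict_from_bam(alias_dict={}, gene_dict=gene_dict,
                               exp_file_name="rnaseq.sorted.bam")
print(exp["DDX11L1"])  # reads per base over the gene body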
Example #33
def annotate(context, bam_path, in_stream, sample, group, cutoff, extendby,
             prefix, threshold):
    """Annotate intervals in a BED-file/stream.

  \b
  BAM_PATH: Path to BAM-file
  IN_STREAM: Chanjo-style BED-file with interval definitions
  """
    # connect to the BAM file
    with Samfile(bam_path) as bam:
        # user defined sample id or randomly generated
        sample = (sample or get_sample_id(bam.header) or id_generator())

    # step 1: metadata header
    metadata = dict(sample_id=sample,
                    group_id=group,
                    cutoff=cutoff,
                    coverage_source=path(bam_path).abspath(),
                    extension=extendby)
    click.echo("#%s" % json.dumps(metadata))

    # step 2: annotate list of intervals with coverage and completeness
    bed_lines = pipe(
        annotate_bed_stream(bed_stream=in_stream,
                            bam_path=bam_path,
                            cutoff=cutoff,
                            extension=extendby,
                            contig_prefix=prefix,
                            bp_threshold=threshold),
        map(serialize_interval(bed=True))  # stringify/bedify
    )

    # reduce/write the BED lines
    for bed_line in bed_lines:
        click.echo(bed_line)
Example #34
def bam_uniq(args):
    """
    * BAM file should be sorted in (tid, pos)
    * (qname, pos, is_unmapped, is_read_2, cigar) is checked
    * if multiple records exist, primary alignment is selected
    * scores are not changed
    """
    sam = Samfile(args.bam)

    # setup output
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)
    it = sam  # TODO region

    def get_key(rec):
        return (rec.qname, rec.pos, rec.is_unmapped, rec.is_read2, rec.cigar)

    def get_best_rec(recs):
        for rec in recs:
            if not rec.is_secondary and not rec.is_supplementary:
                return rec
        return rec  # No primary alignments were found

    for (tid,
         pos), recs in groupby(it, lambda rec:
                               (rec.tid, rec.pos)):  # assume position sorted
        recs1 = sorted(recs, key=get_key)  # manual sort by key is needed
        for key, recs2 in groupby(recs1, get_key):
            rec = get_best_rec(recs2)
            out.write(rec)
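
bam_uniq pulls its inputs from an argparse-style namespace (args.bam and args.output in the body above); a minimal driving sketch with placeholder paths, assuming the input BAM is already sorted by (tid, pos):

from types import SimpleNamespace

# placeholder paths; the input must be position sorted for the groupby to be valid
args = SimpleNamespace(bam="aligned.sorted.bam", output="aligned.uniq.bam")
bam_uniq(args)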
Example #35
def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam',
                          fasta_fn='fixture/ref.fa'):
    # no read filters
    kwargs = {'chrom': 'Pf3D7_01_v3',
              'start': 0,
              'end': 2000,
              'one_based': False}
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
    # read filters
    kwargs['min_mapq'] = 1
    kwargs['no_dup'] = True
    expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs)
    compare_iterators(expected, actual)
Example #36
    def filter(self, infile, countfile):

        inbam = Samfile(infile, 'rb')

        count_labels = [
            'u', 'u-pf', 'u-pf-n',
            'u-pf-n-mm%d' % self.max_mismatches,
            'u-pf-n-mm%d-mito' % self.max_mismatches, 'mm', 'nm', 'qc-flagged',
            'umi-duplicate', 'umi-duplicate-nuclear', 'nuclear-align',
            'autosomal-align', 'paired-aligned', 'paired-nuclear-align',
            'paired-autosomal-align', 'all-aligned', 'all-mapq-filter'
        ]

        logging.debug(count_labels)
        self.counts = dict([(label, 0) for label in count_labels])

        self.chrcounts = defaultdict(int)
        self.mapqcounts = defaultdict(int)
        self.samflagcounts = defaultdict(int)
        self.readlengthcounts = defaultdict(int)

        for read in inbam:
            self.process_read(read, inbam)

        countout = open(countfile, 'a')

        self.write_dict(countout, self.counts)
        self.write_dict(countout, self.chrcounts)
        self.write_dict(countout, self.mapqcounts)
        self.write_dict(countout, self.samflagcounts)
        self.write_dict(countout, self.readlengthcounts)

        countout.close()
Example #37
def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
    handle = Samfile(data_path("gatk_mini_bundle_extract.bam"))

    loci = [
        Locus.from_inclusive_coordinates("20", 10008951),  # 0
        Locus.from_inclusive_coordinates("20", 10009053),  # 1
        Locus.from_inclusive_coordinates("20", 10009053, 10009054),  # 2
        Locus.from_inclusive_coordinates("20", 10006822),  # 3
        Locus.from_inclusive_coordinates("20", 10006822, 10006823),  # 4
    ]
    evidence = PileupCollection.from_bam(handle, loci)

    eq_(evidence.match_summary(Variant(loci[0], "A", "C")), [('A', 1),
                                                             ('C', 4)])
    eq_(
        evidence.filter(drop_duplicates=True).match_summary(
            Variant(loci[0], "A", "C")), [('A', 0), ('C', 3)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "C")), [('A', 3),
                                                             ('C', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "CC")), [('A', 3),
                                                              ('CC', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "")), [('A', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[1], "A", "")), [('A', 3), ('', 0)])
    eq_(evidence.match_summary(Variant(loci[2], "AT", "")), [('AT', 3),
                                                             ('', 0)])
    eq_(evidence.match_summary(Variant(loci[3], "A", "")), [('A', 2), ('', 6)])
    eq_(evidence.match_summary(Variant(loci[4], "AC", "")), [('AC', 2),
                                                             ('', 6)])
    eq_(
        evidence.match_summary(
            Variant(loci[4], "AC", ""),
            lambda e: e.read_attributes().mapping_quality.mean()),
        [('AC', 60.0), ('', 65.0)])
Example #38
def get_bc_signal(arguments):
    (mpbs_region, reads_file, organism, window_size, forward_shift,
     reverse_shift, bias_table) = arguments

    bam = Samfile(reads_file, "rb")
    genome_data = GenomeData(organism)
    signal = np.zeros(window_size)
    # Fetch bias corrected signal
    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2

        if p1 <= 0:
            continue
        # Fetch the bias corrected signal for this region
        _signal = bias_correction(chrom=region.chrom,
                                  start=p1,
                                  end=p2,
                                  bam=bam,
                                  bias_table=bias_table,
                                  genome_file_name=genome_data.get_genome(),
                                  forward_shift=forward_shift,
                                  reverse_shift=reverse_shift)
        if len(_signal) != window_size:
            continue

        # accumulate the signal over all regions
        signal = np.add(signal, np.array(_signal))

    return signal
Example #39
def mc_path_call6(args):
    with open(args.gref) as fp:
        fasta = Fasta(fp)
        contigs = {contig.name: contig.seq.upper() for contig in fasta.contigs}

    with Samfile(args.bam) as sam:
        smb = SamModelBuilder2(sam, regions=args.regions, min_second_bases=args.min_second_bases, contigs=contigs)

    if not args.table:
        hap_depths = {ref.name: args.hap_depth for ref in smb.model.refs}
        ploidies = {ref.name: args.copy_number for ref in smb.model.refs}
    else:
        tab = pd.read_table(args.table)
        hap_depths = dict(zip(tab.contig, tab.hap_depth))
        ploidies = dict(zip(tab.contig, tab.copy_number))
    show_model(smb.model, verbosity=args.verbose)
    from .infer6 import InferModel
    im = InferModel(smb, hap_depths=hap_depths, ploidies=ploidies)
    #im.init_best_het()
    #im.init()
    #im.run_through_variants()
    if args.no_phase:
        im.run_genotyping()
    else:
        im.run_haplotyping()
        start_var = None
        #start_var = smb.model.refs[0].get_variant(1203)
        #im.run_phase_variants(start_var=start_var)
    im.show_variant_info(show_all_variant=True)
Example #41
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    output_f.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
        os.remove(output_fname)
Example #42
def split_reads(reads, ref_reads, alt_reads):
    read_results = Counter()

    ref_bam = Samfile(ref_reads, 'wb', template=reads)
    alt_bam = Samfile(alt_reads, 'wb', template=reads)

    prev_phase = None
    prev_read = None
    prev_qname = None
    test = 0

    for read in reads.fetch(until_eof=True):
        test += 1
        chrom = read.reference_name
        snps_on_chrom = snp_dict[chrom]
        phase = get_phase(read, snps_on_chrom)
        read_qname = read.qname
        if read_qname == prev_qname:
            read_results["tot_read"]+=1
            phase_set = set([phase, prev_phase])
            phase_set.discard(None)
            if len(phase_set) == 1:
                read_phase = phase_set.pop()
                if read_phase == -1:
                    ref_bam.write(read)
                    ref_bam.write(prev_read)
                    read_results["ref_read"]+=1
                elif read_phase == 1:
                    alt_bam.write(read)
                    alt_bam.write(prev_read)
                    read_results["alt_read"]+=1
                elif read_phase == 0:
                    read_results['misphased_read']+=1
            elif len(phase_set)==0:
                read_results["no_snps_read"]+=1
            else:
                read_results['misphased_read']+=1

        prev_read = read
        prev_phase = phase
        prev_qname = read_qname

    return(read_results)
Example #43
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), "subset")
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random() ** i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + "_min"
        else:
            outdir = outdir_base + "{:04.1f}M".format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[: int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, "->", outdir
        stdout.flush()
        of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf)
        sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
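
subsample accepts either separate arguments or a single (path, sizes) tuple, which makes it easy to hand to a multiprocessing map; a hypothetical invocation requesting two subset sizes:

# hypothetical path; ns lists the target subset sizes in reads
kept = subsample("tophat_out/accepted_hits.bam", ns=[1e6, 5e6])
# or, for a pool: pool.map(subsample, [(fn, [1e6, 5e6]) for fn in bam_files])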
Example #44
def _bowtie2_filter(fnam, fastq_path, unmap_out, map_out):
    """
    Divides the reads in a map file into two categories: uniquely mapped, and not.
    Writes them to two separate files.

    """
    try:
        fhandler = Samfile(fnam)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % fnam)
    # get chromosome names via getrname
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    unmap_out = open(unmap_out, 'w')
    map_out   = open(map_out, 'w')
    fastq_in  = open(fastq_path , 'r')
    for line in fhandler:
        line_in = fastq_in.readline()
        if line.is_unmapped or line.mapq < 4:
            read = '%s\t%s\t%s\t%s\t%s\n' % (
                line_in.split('\t', 1)[0].rstrip('\n')[1:],
                line.seq, line.qual, '-', '-'
                )
            unmap_out.write(read)
        else:
            read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % (
                line.qname, line.seq, line.qual, '1',
                crm_dict[line.tid],
                '-' if line.is_reverse else '+', line.pos + 1, len(line.seq))
            map_out.write(read)
        for _ in range(3):
            fastq_in.readline()
    unmap_out.close()
    map_out.close()
    fastq_in.close()
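
A hypothetical call of _bowtie2_filter; fnam is the SAM/BAM produced by the mapper, fastq_path the FASTQ it was mapped from (reads in the same order), and the two output paths receive the tab-separated unmapped and uniquely mapped records:

_bowtie2_filter("iter1.bwt2.bam", "reads_iter1.fastq",
                unmap_out="iter1.unmapped.tsv", map_out="iter1.mapped.tsv")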
Example #45
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print = []
    for aln in samfile.fetch(until_eof = True):
        if pair_is_aligned(aln, ref_ids):
            if args.read_pair == 1 and aln.is_read1:
                reads_to_print.append(aln)
            elif args.read_pair == 2 and aln.is_read2:
                reads_to_print.append(aln)
            elif args.read_pair == 0:
                reads_to_print.append(aln)
        if len(reads_to_print) >= 10000:
            # Flush the reads collected
            print_reads(reads_to_print)
            reads_to_print = []

    print_reads(reads_to_print)
Example #46
 def removeEdgeMismatches(self,bamFile,minDistance, minBaseQual):
     startTime=Helper.getTime()
     minDistance=int(minDistance)
     counter=0;j=0  
     num_lines = len(self.variantDict)
     Helper.info(" [%s] remove Missmatches from the first %s bp from read edges" % (startTime.strftime("%c"),str(minDistance)),self.logFile,self.textField)
     
     bamFile = Samfile(bamFile, "rb")
     
     for varKey in self.variantDict.keys():
         variant = self.variantDict[varKey]
         
         counter+=1
         if counter%10000==0:
             Helper.status('%s mm parsed ' % counter ,self.logFile, self.textField,"grey")
         
         keepSNP=False
         varPos=variant.position-1
         iter = bamFile.pileup(variant.chromosome, variant.position-1, variant.position)
         # walks over the pileup columns which overlap this position
         for x in iter:
             if x.pos == varPos:
                 for pileupread in x.pileups: #walk through the single reads
                     if not pileupread.is_del and not pileupread.is_refskip:
                         distance=abs(pileupread.alignment.alen-pileupread.query_position) if pileupread.alignment.is_reverse else pileupread.query_position
                         if distance >= minDistance:
                             #check readBase and Base Quality
                             if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt and pileupread.alignment.query_qualities[pileupread.query_position]>=minBaseQual:
                             #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                 keepSNP=True
                                 
         if keepSNP==False:
             j+=1
             del self.variantDict[varKey]
     
     Helper.status('%s of %svariants were deleted' % (j,num_lines), self.logFile, self.textField,"black") 
     Helper.printTimeDiff(startTime, self.logFile, self.textField)
     bamFile.close()
Example #48
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print_1 = []
    reads_to_print_2 = []
    reads_to_print_u = []
    for aln in samfile.fetch(until_eof = True):
        if aln.tid in ref_ids: # This read is aligned
            if aln.rnext in ref_ids: # The mate is also aligned
                if aln.is_read1:
                    reads_to_print_1.append(aln)
                    reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
                elif aln.is_read2:
                    reads_to_print_2.append(aln)
                    reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
            else:
                reads_to_print_u.append(aln)
                reads_to_print_u = flush_reads(reads_to_print_u, args.u)

    print_reads(reads_to_print_1, args.R1)
    print_reads(reads_to_print_2, args.R2)
    print_reads(reads_to_print_u, args.u)
Example #49
 def __init__(self, file_name):
     """ 
     Initializes GenomicSignal.
     """
     self.file_name = file_name
     self.bam = None
     self.bw = None
     self.sg_coefs = None
     self.is_bam = False
     self.is_bw = False
     if(self.file_name.split(".")[-1].upper() == "BAM"):
         self.is_bam = True
         self.bam = Samfile(file_name,"rb")
     elif(self.file_name.split(".")[-1].upper() == "BW" or self.file_name.split(".")[-1].upper() == "BIGWIG"):
         self.is_bw = True
         self.bw = BigWigFile(file_name)
     else: pass # TODO ERROR
Example #50
    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb")
        # Check for sortedness, index.
        # There doesn't seem to be a "public" way to do this right
        # now, but that's fine because we're going to have to rewrite
        # it all anyway once the pysam rewrite lands.
        if not self.peer._hasIndex:
            raise ValueError, "Specified bam file lacks a bam index---required for this API"

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            self._loadReferenceFasta(referenceFastaFname)
Example #51
biasTableFFile.close()

biasTableR = dict()
biasTableRFile = open(biasTableRFileName,"r")
for line in biasTableRFile:
  ll = line.strip().split("\t")
  biasTableR[ll[0]] = float(ll[1])
biasTableRFile.close()

#################################################################################################
# EVALUATING PROTECTION
#################################################################################################

# Initialization
protectDict = dict()
bam = Samfile(dnaseFileName,"rb")

# Iterating on MPBSs
for mpbs in mpbsList:

  # Fetching MPBSs
  grepFileName = outputFileName+"_grepmpbs.bed"
  to_remove.append(grepFileName)
  os.system("grep \"\t\""+mpbs+"\"\t\" "+mpbsFileNameBed+" | cut -f 1,2,3 | sort -k1,1 -k2,2n > "+grepFileName)

  # Intersect with footprints
  intFileName = outputFileName+"_fpint.bed"
  to_remove.append(intFileName)
  os.system("intersectBed -a "+grepFileName+" -b "+fpFileNameBed+" -wa -u > "+intFileName)

  # Iterating on MPBSs
Example #52
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 6
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence lebgth, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file1: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print 'WARNING: file "%s" not found' % fnam
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower()=='gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print 'loading SAM file from %s: %s' % (mapper, fnam)
            # get chromosome names via getrname
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)


        # we now have sorted temporary files
        # do a merge sort on each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also write the file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read =  prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
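
A couple of the steps above rely directly on pysam's header and reference APIs: the mapper is guessed from the first @PG record, and chromosome names are resolved through getrname. A minimal standalone sketch of just those two lookups (the file name is hypothetical, not part of the example above):

from pysam import Samfile

bam = Samfile("reads_iter1.bam", "rb")   # hypothetical input BAM
# guess the aligner from the first @PG record, as the parser above does
mapper = bam.header["PG"][0]["ID"]
# build the tid -> chromosome name lookup used while iterating over reads
crm_dict = dict((tid, bam.getrname(tid)) for tid in range(bam.nreferences))
print("%s %s" % (mapper, crm_dict))
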
Example #53
0
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames   = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s    = [r["M5"] for r in refRecords]
        refIds = map(self.peer.gettid, refNames)
        nRefs = len(refRecords)

        if nRefs > 0:
            self._referenceInfoTable = np.rec.fromrecords(zip(
                refIds,
                refIds,
                refNames,
                refNames,
                refLengths,
                refMD5s,
                np.zeros(nRefs, dtype=np.uint32),
                np.zeros(nRefs, dtype=np.uint32)),
                dtype=[('ID', '<i8'), ('RefInfoID', '<i8'),
                       ('Name', 'O'), ('FullName', 'O'),
                       ('Length', '<i8'), ('MD5', 'O'),
                       ('StartRow', '<u4'), ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""])
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
            #rgFrameRate = ds["FRAMERATEHZ"]
            rgFrameRate = 75.0
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        self._pulseFeaturesAvailable = pulseFeaturesInAll_


    def _loadProgramInfo(self):
        pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
                      for pg in self.peer.header.get("PG", []) ]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID"     ,     "O"),
                       ("Version",     "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens   = set((c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM"
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0b3
        try:
            checkedVersion = self.version
        except:
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0b3 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader"
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not(self.isMapped)

    @property
    def isMapped(self):
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
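
The reader above assembles its reference and read-group tables from the @SQ and @RG header records, including the ';'-separated KEY=VALUE pairs in the RG DS field. A minimal pysam-only sketch of reading the same records (the file name is hypothetical; this is not the pbcore API itself):

from pysam import Samfile

bam = Samfile("movie.subreads.bam", "rb", check_sq=False)   # hypothetical file
for sq in bam.header.get("SQ", []):
    print("%s\t%d" % (sq["SN"], sq["LN"]))                  # reference name and length
for rg in bam.header.get("RG", []):
    # DS holds a ';'-separated list of KEY=VALUE pairs
    ds = dict(p.split("=", 1) for p in rg.get("DS", "").split(";") if p)
    print("%s %s %s" % (rg["ID"], rg.get("PU"), ds.get("READTYPE")))
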
Example #54
0
def main(argv=None):
    """Main script."""
    ##########################
    # COMMAND-LINE ARGUMENTS #
    ##########################

    # Get myself
    program_name = sys.argv[0]

    if not argv:
        argv = sys.argv[1:]

    # Get the cluster type used to control arguments
    cluster_type = cluster.get_cluster_environment()

    parser  = argparse.ArgumentParser(
        description=__doc__, add_help=False,
        epilog=EPILOG, formatter_class=run.CustomFormatter)

    req = parser.add_argument_group('Required arguments')
    req.add_argument('-m', '--mode',
                     help='Operation mode', choices=['single', 'multi'],
                     required=True, metavar='mode')
    req.add_argument('-s', '--snps',
                     help='SNP BED file', required=True, metavar='<BED>')
    req.add_argument('-r', '--reads',
                     help='Mapped reads file [sam or bam]',
                     required=True, metavar='<[S/B]AM>')

    uni = parser.add_argument_group('Universal optional arguments')
    uni.add_argument('-p', '--prefix',
                     help='Prefix for temp files and output', default='TEST',
                     metavar='')
    uni.add_argument('-b', '--bam', action='store_true', dest='bam',
                     help='Mapped read file type is bam (auto-detected if *.bam)')
    uni.add_argument('-n', '--noclean', action='store_true',
                     help='Do not delete intermediate files (for debugging)')
    uni.add_argument('-R', '--random-seed', default=None, type=int,
                     help='Set the state of the randomizer (for testing)')
    uni.add_argument('-h', '--help', action='help',
                     help='show this help message and exit')

    mult = parser.add_argument_group('Multi(plex) mode arguments')
    mult.add_argument('-j', '--jobs', type=int,
                      help='Divide into # of jobs', default=100, metavar='')
    if cluster_type == 'slurm' or cluster_type == 'torque':
        mult.add_argument('-w', '--walltime',
                          help='Walltime for each job', default='3:00:00',
                          metavar='')
        mult.add_argument('-k', '--mem', dest='memory', metavar='',
                          help='Memory for each job', default='5000MB')
        mult.add_argument('--queue',
                          help='Queue to submit jobs to', default='batch',
                          metavar='')
        mult.add_argument('--cluster', choices=['torque', 'slurm', 'normal'],
                          help='Which cluster to use, normal uses threads ' +
                          'on this machine', default=cluster_type)
    mult.add_argument('--threads', type=int, metavar='', default=cpu_count(),
                      help='Max number of threads to run at a time ' +
                      '(normal mode only).')

    single = parser.add_argument_group('Single mode arguments')
    single.add_argument('-f', '--suffix', default='', metavar='',
                        help='Suffix for multiplexing [set automatically]')

    logging = parser.add_argument_group('Logging options')
    logging.add_argument('-q', '--quiet', action='store_true',
                         help="Quiet mode, only prints warnings.")
    logging.add_argument('-v', '--verbose', action='store_true',
                         help="Verbose mode, prints debug info too.")
    logging.add_argument('--logfile',
                         help='Logfile to write messages to; default is ' +
                         'STDERR')

    args = parser.parse_args()
    if args.random_seed is not None:
        random.seed(args.random_seed)
        print("Seed: ", args.random_seed, random.getstate()[1][:10])

    ###########################################################################
    #                            File Preparations                            #
    ###########################################################################

    # Take care of logging
    if args.logfile:
        logme.LOGFILE = args.logfile
    if args.quiet:
        logme.MIN_LEVEL = 'warn'
    elif args.verbose:
        logme.MIN_LEVEL = 'debug'

    # Initialize variables
    prefix = args.prefix + '_'

    # Make sure we can run ourselves
    if not run.is_exe(program_name):
        program_name = run.which(parser.prog)

    # Set the cluster type if we are in multi mode
    if args.mode == 'multi' and (cluster_type == 'slurm'
                                 or cluster_type == 'torque'):
        cluster.QUEUE = args.cluster

    # Check if the read file is sam or bam
    file_check = args.reads.split('.')
    file_check[-1] = file_check[-1].lower()
    sam_path, sam_file = os.path.split(args.reads)

    if args.reads.endswith('bam') or args.bam:
        mode = 'rb'
    else:
        mode = 'r'

    ##################
    # MULTIPLEX MODE #
    ##################

    # If we're running in multiplex mode
    if args.mode == 'multi':
        logme.log('Splitting sam file {} into {} files.'.format(sam_file,
                                                                args.jobs))
        reads_files = split_samfile(os.path.join(sam_path, sam_file),
                                    args.jobs, prefix)
        logme.log('Splitting complete.')

        # Create PBS scripts and submit jobs to the cluster
        subnoclean = ' --noclean' if args.noclean else ''
        logme.log('Submitting split files to cluster')
        jobs = []  # Hold job info for later checking
        for reads_file in reads_files:
            suffix = reads_file[-4:]

            command = ("python2 " + program_name + " --mode single --snps " +
                       args.snps + " --reads " + reads_file + " --suffix " +
                       suffix + " --prefix " + args.prefix + subnoclean +
                       ' --bam')

            if cluster_type == 'normal':
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           threads=args.threads))
            else:
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           time=args.walltime, cores=1,
                                           mem=args.memory,
                                           partition=args.queue))
            sleep(2)    # Pause for two seconds to make sure job is submitted

        # Now wait and check for all jobs to complete every so long
        logme.log('Submission done, waiting for jobs to complete.')

        # First wait for jobs in queue to complete
        cluster.wait(jobs)
        sleep(1)

        # Next, check if any jobs failed
        failed = []
        for i in range(1, args.jobs+1):
            suffix = str(i).zfill(4)
            if not os.path.isfile(prefix + suffix + '_done'):
                failed.append(prefix + suffix)

        # If any jobs failed, terminate
        if failed:
            logme.log('Some jobs failed!', 'critical')
            return -1

        logme.log('Jobs completed.')
        # Remove 'done' files in case we want to run again.
        os.system('rm {prefix}*_done'.format(prefix=prefix))

        # Once the jobs are done, concatenate all of the counts into one file.
        # Initialize dictionaries

        tot_pos_counts = {}
        tot_neg_counts = {}
        tot_tot_counts = {}
        tot_sum_pos = {}
        tot_sum_neg = {}

        for i in range(1, args.jobs+1):
            suffix = str(i).zfill(4)
            in_counts = prefix + 'SNP_COUNTS_' + suffix

            # Parse the line to add it to the total file
            with run.open_zipped(in_counts, 'r') as in_counts:
                for line in in_counts:
                    line = line.rstrip('\n')
                    line_t = line.split('\t')

                    if 'CHR' in line:
                        continue

                    pos = line_t[0] + '|' + line_t[1]

                    pos_split = line_t[2].split('|')
                    neg_split = line_t[3].split('|')

                    if pos in tot_pos_counts or pos in tot_neg_counts or pos in tot_tot_counts:
                        for j in range(len(pos_split)):
                            tot_pos_counts[pos][j] += int(pos_split[j])
                            tot_neg_counts[pos][j] += int(neg_split[j])
                        tot_sum_pos[pos] += int(line_t[4])
                        tot_sum_neg[pos] += int(line_t[5])
                        tot_tot_counts[pos] += int(line_t[6])

                    else:
                        tot_pos_counts[pos] = [0, 0, 0, 0]
                        tot_neg_counts[pos] = [0, 0, 0, 0]
                        tot_tot_counts[pos] = 0
                        tot_sum_pos[pos] = 0
                        tot_sum_neg[pos] = 0
                        for j in range(len(pos_split)):
                            tot_pos_counts[pos][j] += int(pos_split[j])
                            tot_neg_counts[pos][j] += int(neg_split[j])
                        tot_sum_pos[pos] += int(line_t[4])
                        tot_sum_neg[pos] += int(line_t[5])
                        tot_tot_counts[pos] += int(line_t[6])

        # Write out the final concatenated file
        with run.open_zipped(prefix + 'SNP_COUNTS.txt', 'w') as final_counts:
            final_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                               'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')

            keys = sorted(tot_pos_counts.keys())

            for key in keys:
                pos = key.split('|')
                pos_fix = [str(x) for x in tot_pos_counts[key]]
                neg_fix = [str(x) for x in tot_neg_counts[key]]
                pos_out = '|'.join(pos_fix)
                neg_out = '|'.join(neg_fix)
                final_counts.write(str(pos[0]) + '\t' + str(pos[1]) + '\t' +
                                   pos_out + '\t' + neg_out + '\t' +
                                   str(tot_sum_pos[key]) + '\t' +
                                   str(tot_sum_neg[key]) + '\t' +
                                   str(tot_tot_counts[key]) + '\n')

        # Sort the file numerically
        os.system('sort -k1,2 -n ' + prefix + 'SNP_COUNTS.txt ' + ' -o ' +
                  prefix + 'SNP_COUNTS.txt')

        # Clean up intermediate files.
        if args.noclean is False:
            cluster.clean()
            os.system('rm {prefix}*COUNTS_* {prefix}*split_sam_*'.format(
                prefix=prefix))

    ###############
    # SINGLE MODE #
    ###############

    # If we're running in single mode (each job submitted by multiplex mode
    # will be running in single mode)
    elif args.mode == 'single':

        # First read in the information on the SNPs that we're interested in.
        snps = {}    # Initialize a dictionary of SNP positions

        with run.open_zipped(args.snps) as snp_file:
            for line in snp_file:
                line = line.rstrip('\n')
                line_t = line.split('\t')

                pos = chrom_to_num(line_t[0]) + '|' + str(line_t[2])
                snps[pos] = line_t[3]

        # This is the dictionary of potential SNPs for each read.
        potsnp_dict = {}

        # Now parse the SAM file to extract only reads overlapping SNPs.
        in_sam     = Samfile(args.reads, mode)
        references = in_sam.references  # Faster to make a copy of references.

        # Trackers to count how many reads are lost at each step
        indel_skip = 0
        nosnp_skip = 0
        count      = 0
        snp_count  = 0
        ryo_filter = 0

        for line in in_sam:
            count += 1

            # Skip lines that overlap indels OR don't match Ns
            cigarstring = line.cigarstring

            if 'D' in cigarstring or 'I' in cigarstring:
                indel_skip += 1
                continue

            # Split the tags to find the MD tag:
            tags = line.tags
            for tagname, tagval in tags:
                if tagname == 'MD' and 'N' in tagval:
                    # Remember that, for now, we're not allowing reads that
                    # overlap insertions/deletions.

                    chrom = references[line.rname]
                    pos   = line.pos
                    read  = line.seq

                    # We're assuming
                    # correct mapping such that FIRST MATES on the NEGATIVE
                    # STRAND are NEGATIVE, while SECOND MATES on the NEGATIVE
                    # STRAND are POSITIVE.

                    if line.is_reverse:
                        orientation = '-'
                    else:
                        orientation = '+'

                    # Parse the CIGAR string
                    cigar_types, cigar_vals = split_CIGAR(cigarstring)

                    if cigar_types[0] == 'S':
                        MD_start = int(cigar_vals[0])
                    else:
                        MD_start = 0

                    # Get the genomic positions corresponding to each base-pair
                    # of the read
                    read_genomic_positions = CIGAR_to_Genomic_Positions(
                        cigar_types, cigar_vals, line.pos+1)

                    # Get the tag data
                    MD_split = re.findall('\d+|\D+', tagval)

                    genome_start = 0

                    # The snp_pos dictionary will store the 1-base position
                    # => allele
                    snp_pos = {}
                    for i in MD_split:
                        if re.match('\^', i):
                            pass
                        elif i.isalpha():
                            if i == 'N':
                                snp_pos[read_genomic_positions[genome_start]] = read[MD_start]
                                MD_start += 1
                                genome_start += 1
                            else:
                                MD_start += 1
                                genome_start += 1
                        else:
                            MD_start += int(i)
                            genome_start += int(i)

                    for i in snp_pos:
                        snp_count += 1

                        # RYO: START EDIT - Implemented Filter
                        posVal = line.reference_name + '|' + str(i)
                        if posVal not in snps:
                            nosnp_skip += 1
                            continue
                        # RYO: END EDIT - Implemented Filter

                        snp = '{chr}|{i}\t{snp_pos}\t{orientation}'.format(
                            chr=chrom, i=i, snp_pos=snp_pos[i],
                            orientation=orientation)
                        if line.qname in potsnp_dict:
                            if snp not in potsnp_dict[line.qname]:
                                # RYO EDIT HERE - added conditional so that
                                # pairs of reads are not considered twice if
                                # they both overlap the same snp.
                                potsnp_dict[line.qname].append(snp)
                            else:
                                ryo_filter += 1
                        else:
                            potsnp_dict[line.qname] = []
                            potsnp_dict[line.qname].append(snp)

        in_sam.close()

        # Log all of the skipped reads
        logme.log('Total reads: {}'.format(count), 'debug')
        logme.log('Reads skipped for indels: {}'.format(indel_skip), 'debug')
        logme.log('Total SNPs checked: {}'.format(snp_count), 'debug')
        logme.log('SNPs not in SNP list: {}'.format(nosnp_skip), 'debug')
        logme.log('Ryo filter: {}'.format(ryo_filter), 'debug')

        # Initialize the counting dictionaries
        pos_counts = {}
        neg_counts = {}

        # Go through the potential SNP dictionary and choose one SNP at random
        # for those overlapping multiple SNPs
        if args.random_seed is not None:  # Dictionaries are unordered, so must sort for consistent random seed output.
            keys = sorted(list(potsnp_dict.keys()))
        else:  # Because sorting is slow, only do it if a random seed is set; the slowdown is about 0.1 s per 1 million reads.
            keys = list(potsnp_dict.keys())
        for key in keys:
            snp = random.choice(potsnp_dict[key]).split('\t')

            if snp[0] in snps:
                if snp[0] in pos_counts or snp[0] in neg_counts:
                    if snp[2] == '+':
                        if snp[1] == 'A':
                            pos_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            pos_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            pos_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            pos_counts[snp[0]][3] += 1

                    elif snp[2] == '-':
                        if snp[1] == 'A':
                            neg_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            neg_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            neg_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            neg_counts[snp[0]][3] += 1

                else:
                    pos_counts[snp[0]] = [0, 0, 0, 0]
                    neg_counts[snp[0]] = [0, 0, 0, 0]
                    if snp[2] == '+':
                        if snp[1] == 'A':
                            pos_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            pos_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            pos_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            pos_counts[snp[0]][3] += 1

                    elif snp[2] == '-':
                        if snp[1] == 'A':
                            neg_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            neg_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            neg_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            neg_counts[snp[0]][3] += 1

        # Open the output file and write the SNP counts to it

        out_counts = prefix + 'SNP_COUNTS_' + args.suffix if args.suffix \
            else prefix + 'SNP_COUNTS.txt'

        with open(out_counts, 'w') as out_counts:
            # Write header
            out_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                             'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')

            # Sort SNP positions and write them
            keys = sorted(pos_counts.keys())

            for key in keys:
                pos = key.split('|')
                sum_pos = sum(pos_counts[key])
                sum_neg = sum(neg_counts[key])
                tot_sum = sum(pos_counts[key]) + sum(neg_counts[key])
                pos_fix = [str(x) for x in pos_counts[key]]
                neg_fix = [str(x) for x in neg_counts[key]]
                positive = '|'.join(pos_fix)
                negative = '|'.join(neg_fix)

                out_counts.write(pos[0] + '\t' + pos[1] + '\t' + positive +
                                 '\t' + negative + '\t' + str(sum_pos) + '\t' +
                                 str(sum_neg) + '\t' + str(tot_sum) + '\n')

        if args.suffix:
            os.system('touch ' + prefix + args.suffix + '_done')
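
The single-mode loop above finds masked reference positions ('N' bases) by walking the MD tag token by token while tracking an offset into the read. A toy standalone sketch of that walk (the MD string is made up; '^'-prefixed tokens are reference deletions and consume no read bases):

import re

md = "10N4A20"           # hypothetical MD tag: 10 matches, ref N, 4 matches, ref A, 20 matches
read_offset = 0          # offset into the read (in the script this starts after soft-clipped bases)
n_offsets = []           # read offsets whose reference base is N
for token in re.findall(r'\d+|\D+', md):
    if token.startswith('^'):        # deletion from the reference
        continue
    elif token.isalpha():            # mismatched reference base(s)
        for base in token:
            if base == 'N':
                n_offsets.append(read_offset)
            read_offset += 1
    else:                            # run of matching bases
        read_offset += int(token)
print(n_offsets)                     # -> [10]
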
Example #55
0
def get_raw_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) / 2
        p1 = mid - window_size / 2
        p2 = mid + window_size / 2

        if p1 <= 0:
            continue

        # Fetch raw signal
        for read in bam1.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0

        for read in bam2.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0

        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
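
update_pwm is not shown in this example; a plausible stand-in (an assumption about its behaviour, not the authors' code) simply counts the reference bases of each window into the pwm dictionary initialised above, using pysam's Fastafile.fetch:

def update_pwm(pwm, fasta, region, p1, p2):
    # Hypothetical helper: accumulate reference base counts over the window.
    # Assumes pwm maps 'A', 'C', 'G', 'T', 'N' to lists of length (p2 - p1).
    seq = str(fasta.fetch(region.chrom, p1, p2)).upper()
    for i, base in enumerate(seq):
        key = base if base in "ACGT" else "N"
        pwm[key][i] += 1.0
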
Example #56
0
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a sam file and split it splits number of times.

    :path:    Where to put the split files.
    :prefix:  A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines)/splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt      = 0
    currjob  = 1
    suffix   = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode    = 'rb' if sam_name.split('.')[-1].lower() == 'bam' else 'r'  # check the extension, not the basename
    wmode    = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2     = next(in_sam)
                currjob  += 1
                suffix    = '.split_sam_' + str(currjob).zfill(4)
                run_file  = os.path.join(path, prefix + sam_name + suffix)
                new_sam   = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)

                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
        sam_split.close()
    return tuple(outfiles)
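
count_reads is called at the top of split_samfile but not shown here; a minimal stand-in (an assumption, not the project's implementation) just counts the alignment records with pysam:

from pysam import Samfile

def count_reads(sam_file):
    # Hypothetical helper: count records in a SAM/BAM file.
    mode = 'rb' if sam_file.lower().endswith('bam') else 'r'
    with Samfile(sam_file, mode) as sam:
        return sum(1 for _ in sam)
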
Example #57
0
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq,
              re_name, verbose=False, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keeps a summary of the results in 2 tab-separated files that will contain 7
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence length, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param frags: a dictionary generated by :func:`pytadbit.mapping.restriction_enzymes.map_re_sites`.

    """
    frags = map_re_sites(re_name, genome_seq, verbose=True)
    frag_chunk = kwargs.get('frag_chunk', 100000)

    fnames = f_names1, f_names2
    outfiles = out_file1, out_file2
    for read in range(2):
        if verbose:
            print 'Loading read' + str(read + 1)
        reads    = []
        for fnam in fnames[read]:
            if verbose:
                print 'loading file:', fnam
            try:
                fhandler = Samfile(fnam)
            except IOError:
                continue
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i).replace('chr', '')
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if r.tags[1][1] != 1:
                    continue
                positive   = not r.is_reverse
                crm        = crm_dict[r.tid]
                len_seq    = len(r.seq)
                pos        = r.pos + (0 if positive else len_seq)
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx        = bisect(frag_piece, pos)
                prev_re    = frag_piece[idx - 1]
                next_re    = frag_piece[idx]
                name       = r.qname

                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
        reads_fh = open(outfiles[read], 'w')
        reads_fh.write(''.join(sorted(reads)))
        reads_fh.close()
    del(reads)
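
Both of the parse_sam-style functions above find the flanking restriction sites by hashing the genome into frag_chunk-sized bins of pre-sorted RE positions and then bisecting within the bin. A toy illustration of that lookup (all positions are made up):

from bisect import bisect

frag_chunk = 100000
# hypothetical pre-computed RE-site positions for one chromosome, binned by frag_chunk
frags = {'1': {0: [1500, 40000, 75000, 99000], 1: [120000, 160000]}}

pos = 50000                                   # mapped position of a read
frag_piece = frags['1'][pos // frag_chunk]    # bin containing this position
idx = bisect(frag_piece, pos)                 # index of the next RE site downstream
prev_re, next_re = frag_piece[idx - 1], frag_piece[idx]
print("%d %d" % (prev_re, next_re))           # -> 40000 75000
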
Example #58
0
from progressbar import ProgressBar
import pandas as pd
try:
    import matplotlib.pyplot as mpl
    mpl.figure()
    mpl.plot([1,2])
    mpl.close()
    has_mpl = True
except:
    has_mpl = False

from pysam import Samfile

if __name__ == "__main__":
    window_size = int(10e3)
    mel = Samfile('analysis/on_mel/mel_gdna_bowtie2_dedup.bam')
    sim = Samfile('analysis/on_mel/sim_gdna_bowtie2_dedup.bam')

    posns = []
    mel_covs = []
    sim_covs = []
    prog = 0
    pbar = ProgressBar(max_value=sum(mel.lengths) + 1)
    for r, l in zip(mel.references, mel.lengths):
        for start in range(0, l, window_size):
            posns.append('{}_{}'.format(r, start))
            mel_covs.append(sum(sum(i) for i in
                                mel.count_coverage(r, start, start+window_size)))
            sim_covs.append(sum(sum(i) for i in
                                sim.count_coverage(r, start, start+window_size)))
        prog += l
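
count_coverage returns four per-position arrays (A, C, G and T counts), which is why the loop above sums twice: over the four base arrays and over the positions within the window. A standalone illustration with a hypothetical BAM and window:

from pysam import Samfile

bam = Samfile("example_dedup.bam", "rb")          # hypothetical indexed BAM
a, c, g, t = bam.count_coverage("2L", 0, 10000)   # hypothetical contig and window
total_cov = sum(sum(x) for x in (a, c, g, t))     # total base coverage in the window
print(total_cov)
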
Example #59
0
File: sam.py Project: arvin580/jcvi
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert bam format to ace format. This often allows the remapping to be
    assessed as a de novo assembly format. The bam file needs to be indexed. Also
    creates a .mates file to be used in amos/bambus, and an .astat file to mark
    whether the contig is unique or repetitive based on the A-statistic from the
    Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
    print >> fw

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                nsegments)
        print >> fw, fill(str(cseq.seq))
        print >> fw

        if astat:
            astat = Astat(nbases, nreads, genomesize, totalreads)
            print >> astatfw, "{0}\t{1:.1f}".format(contig, astat)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print >> fw, "BQ\n{0}".format(text)
        print >> fw

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print >> readsfw, readname
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print >> fw, af

        print >> fw

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags,
                    fill(aseq))
            qs = "QA 1 {0} 1 {0}".format(alen)

            print >> fw, rd
            print >> fw
            print >> fw, qs
            print >> fw
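
The Astat helper used above is not shown; a plausible stand-in based on the standard Celera/Myers A-statistic (an assumption about what it computes, not the jcvi code) compares the expected and observed read counts for a contig:

import math

def Astat(nbases, nreads, genomesize, totalreads):
    # Hypothetical stand-in: Myers' A-statistic, the log-odds of a contig being
    # unique (single copy) rather than a two-copy repeat, given the read arrival rate.
    expected = float(nbases) * totalreads / genomesize
    return expected - nreads * math.log(2)
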
Example #60
0
class _BamReaderBase(object):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames   = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s    = [r["M5"] for r in refRecords]
        refIds = map(self.peer.gettid, refNames)
        nRefs = len(refRecords)

        self._referenceInfoTable = np.rec.fromrecords(zip(
            refIds,
            refIds,
            refNames,
            refNames,
            refLengths,
            refMD5s,
            np.zeros(nRefs, dtype=np.uint32),
            np.zeros(nRefs, dtype=np.uint32)),
            dtype=[('ID', '<i8'), ('RefInfoID', '<i8'),
                   ('Name', 'O'), ('FullName', 'O'),
                   ('Length', '<i8'), ('MD5', 'O'),
                   ('StartRow', '<u4'), ('EndRow', '<u4')])
        self._referenceDict = {}
        self._referenceDict.update(zip(refIds, self._referenceInfoTable))
        self._referenceDict.update(zip(refNames, self._referenceInfoTable))


    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys())
        for rg in rgs:
            # Regarding RG ID: BLASR currently outputs a hex digest of
            # 10 nibbles, instead of the 8 which would fit into a
            # 32-bit word.  So we truncate here for the purposes of
            # cross-referencing within this API and the PacBioBamIndex
            # API.  We do check for a collision below.
            rgID = int(rg["ID"][:8], 16)
            rgName = rg["PU"]
            ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], ds["SOFTWAREVERSION"]
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem))
            pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys())

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.uint32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O")])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        self._pulseFeaturesAvailable = pulseFeaturesInAll_


    def _loadProgramInfo(self):
        # TODO: guarantee that these fields are nonoptional in our bams --- check with Marcus
        # TODO: are we interesting in the PP info?
        self._programTable = np.rec.fromrecords(
            [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
              for pg in self.peer.header["PG"] ],
            dtype=[("ID"     ,     "O"),
                   ("Version",     "O"),
                   ("CommandLine", "O")])

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, c.length) for c in ft)
        bamIdsAndLens   = set((c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM"
        self.referenceFasta = ft

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = Samfile(fname, "rb")
        # Check for sortedness, index.
        # There doesn't seem to be a "public" way to do this right
        # now, but that's fine because we're going to have to rewrite
        # it all anyway once the pysam rewrite lands.
        if not self.peer._hasIndex:
            raise ValueError, "Specified bam file lacks a bam index---required for this API"

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    def attach(self, fofnFilename):
        self.basH5Collection = BasH5Collection(fofnFilename)

    @property
    def moviesAttached(self):
        return (self.basH5Collection is not None)

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    #TODO: change concept to readGroupTable in cmp.h5
    @property
    def movieInfoTable(self):
        raise Unimplemented()

    # TODO: change to read group accessor, this is semantically wrong now
    def movieInfo(self, movieId):
        raise Unimplemented()

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroup(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    #TODO: elide "Info" innames?
    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    #TODO: Marcus needs to put something in the spec for this
    @property
    def version(self):
        raise Unimplemented()

    #TODO: Marcus needs to put something in the spec for this
    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return True

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    # TODO: make this private in cmp.h5 reader
    def alignmentGroup(self, alnGroupId):
        raise UnavailableFeature("BAM has no HDF5 groups")

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    def __repr__(self):
        return "<%s for %s>" % (type(self).__name__, self.filename)


    def __len__(self):
        return self.peer.mapped

    def close(self):
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
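
The read-group handling above truncates BLASR's 10-nibble hex digest to the first 8 hex characters so the ID fits in a 32-bit word; a tiny standalone illustration (the digest value is made up):

rg_id_str = "abcdef0123"          # hypothetical 10-nibble hex digest from an @RG ID
rg_id = int(rg_id_str[:8], 16)    # keep the first 8 hex chars -> fits in 32 bits
print(hex(rg_id))                 # -> 0xabcdef01
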