Example #1
def fetch_count_read(alignment_file, seq_name, start, end):
    """
    Count the number of reads that at least partly overlap a specified chromosomal region
    @param alignment_file Path to a SAM or BAM file
    @param seq_name Name of the reference sequence the reads are aligned to
    @param start Start genomic coordinate of the area of alignment
    @param end End genomic coordinate of the area of alignment
    """
    # Specific imports
    from pysam import AlignmentFile
    
    # Init a generator on the sam or bam file with pysam
    if alignment_file[-3:].lower() == "bam":
        al = AlignmentFile(alignment_file, "rb")
        
    elif alignment_file[-3:].lower() == "sam":
        al = AlignmentFile(alignment_file, "r")
    
    else:
        raise Exception("Wrong file format (sam or bam)") 
    
    # Count reads aligned at least partly within the specified region
    n = 0
    for i in al.fetch(seq_name, start, end):
        n += 1
        
    al.close()
    
    return n
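A minimal usage sketch for the function above; the path, contig name and coordinates are placeholders, and region queries require a coordinate-sorted, indexed BAM.

# Hypothetical call: count reads overlapping chr1:10000-20000 in an indexed BAM.
n_reads = fetch_count_read("sample.bam", "chr1", 10000, 20000)
print("{} reads overlap the region".format(n_reads))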
Example #2
File: umis.py Project: vals/umis
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given
    cellular barcodes
    """
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = track.next()
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
Example #3
def get_counts(args):
    """function to get fragment sizes

    """
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])  
    chunks = ChunkList.read(args.bed)
    mat = np.zeros(len(chunks), dtype=np.int)
    bamHandle = AlignmentFile(args.bam)
    j = 0
    for chunk in chunks:
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    #get left position
                    l_pos = read.pos + 4
                    #get insert size
                    #correct by 8 base pairs to be insertion to insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
        j += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
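The snippet relies on module-level imports (numpy as np, os, AlignmentFile, ChunkList) and a _between helper that is not shown. A plausible sketch of that helper, assuming it simply tests whether a value falls in the half-open interval [lower, upper):

def _between(x, lower, upper):
    # Assumed behaviour: True when x lies in [lower, upper).
    return lower <= x < upper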
Example #4
File: umis.py Project: roryk/umis
def bamtag(sam, umi_only):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    if umi_only:
        parser_re = re.compile('.*:UMI_(?P<MB>.*)')
    else:
        parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    start_time = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    out_file = AlignmentFile("-", "wh", template=sam_file)

    track = sam_file.fetch(until_eof=True)

    for count, aln in enumerate(track):
        if not count % 100000:
            logger.info("Processed %d alignments.")

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if not umi_only:
            aln.tags += [('XC', match.group('CB'))]

        aln.tags += [('XR', match.group('MB'))]
        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time)))
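A hedged invocation sketch for bamtag above; the input path is a placeholder, the function assumes module-level re, time and logger objects, and it streams SAM to stdout through the "-"/"wh" output file.

# Hypothetical call: add XC/XR tags parsed from fastqtransformed read names and
# stream the tagged alignments to stdout, e.g. for piping into
# `samtools view -b - > tagged.bam`.
bamtag("transformed.sam", umi_only=False)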
Example #5
def umappedq2zero(bamdir):
    """
    Reads in a BAM file, setting the MAPQ value for an alignment segment
    to zero if it is unmapped.
    Opens up both infile and outfile and outputs these modified
    reads to outfile.
    """
    if not os.path.exists(bamdir):
        sys.stderr.write("Sorry, but the specified directory does not exist.")
        sys.exit(1)

    bamfiles = os.listdir(bamdir)
    bampaths = filter(lambda x: x.endswith(".bam"), bamfiles)
    bampaths = map(lambda x: os.path.join(bamdir, x), bampaths)
    for bam in bampaths:
        inbam = AlignmentFile(bam, "rb")
        # Template is specified to maintain the same header information.
        outbam = AlignmentFile("temp.bam", "wb", template=inbam)
        # Construct reads iterator using fetch.
        reads = inbam.fetch(until_eof=True)
        for read in reads:
            if read.is_unmapped == True:
                read.mapping_quality = 0
            outbam.write(read)  # Don't omit any reads!
        # Overwrite the original with the new file with MAPQs set to zero.
        os.rename("temp.bam", bam)
Example #6
def _read_bam_frag(inbam, filter_exclude, all_bins, sections1, sections2,
                   rand_hash, resolution, tmpdir, region, start, end,
                   half=False, sum_columns=False):
    bamfile = AlignmentFile(inbam, 'rb')
    refs = bamfile.references
    bam_start = start - 2
    bam_start = max(0, bam_start)
    try:
        dico = {}
        for r in bamfile.fetch(region=region,
                               start=bam_start, end=end,  # coords starts at 0
                               multiple_iterators=True):
            if r.flag & filter_exclude:
                continue
            crm1 = r.reference_name
            pos1 = r.reference_start + 1
            crm2 = refs[r.mrnm]
            pos2 = r.mpos + 1
            try:
                pos1 = sections1[(crm1, pos1 / resolution)]
                pos2 = sections2[(crm2, pos2 / resolution)]
            except KeyError:
                continue  # not in the subset matrix we want
            crm = crm1 * (crm1 == crm2)
            try:
                dico[(crm, pos1, pos2)] += 1
            except KeyError:
                dico[(crm, pos1, pos2)] = 1
            # print '%-50s %5s %9s %5s %9s' % (r.query_name,
            #                                  crm1, r.reference_start + 1,
            #                                  crm2, r.mpos + 1)
        if half:
            for c, i, j in list(dico):
                if i < j:
                    del dico[(c, i, j)]
        out = open(os.path.join(tmpdir, '_tmp_%s' % (rand_hash),
                                '%s:%d-%d.tsv' % (region, start, end)), 'w')
        out.write(''.join('%s\t%d\t%d\t%d\n' % (c, a, b, v)
                          for (c, a, b), v in dico.iteritems()))
        out.close()
        if sum_columns:
            sumcol = {}
            cisprc = {}
            for (c, i, j), v in dico.iteritems():
                # out.write('%d\t%d\t%d\n' % (i, j, v))
                try:
                    sumcol[i] += v
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
                except KeyError:
                    sumcol[i]  = v
                    cisprc[i]  = [0, 0]
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
            return sumcol, cisprc
    except Exception, e:
        exc_type, exc_obj, exc_tb = exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print e
        print(exc_type, fname, exc_tb.tb_lineno)
Example #7
def samfile(filename):
    """
    A context manager to open and close a SAM/BAM file.

    @param filename: A C{str} file name to open.
    """
    f = AlignmentFile(filename)
    yield f
    f.close()
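For the generator above to be usable in a with statement it presumably carries a contextlib.contextmanager decorator in the original module (not shown here). A self-contained sketch of the same idea, with the close guarded by try/finally:

from contextlib import contextmanager

from pysam import AlignmentFile


@contextmanager
def samfile(filename):
    """A context manager to open and close a SAM/BAM file."""
    f = AlignmentFile(filename)
    try:
        yield f
    finally:
        f.close()


# Usage (path is a placeholder):
# with samfile("sample.bam") as sf:
#     n_alignments = sum(1 for _ in sf)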
Example #8
 def test_downgrade_read_edges_binary(self):
     binary = os.path.join(BAM_BIN_DIR, "downgrade_bam_edge_qual")
     bam_fpath = os.path.join(TEST_DATA_DIR, "sample_rev.bam")
     with NamedTemporaryFile() as out_fhand:
         cmd = [binary, "-o", out_fhand.name, bam_fpath]
         check_call(cmd)
         sam = AlignmentFile(out_fhand.name)
         res = [0, 0]
         read = sam.next()
         assert list(read.query_qualities[:2]) == res
         assert read.get_tag("dl") == "8)5B"
         assert read.get_tag("dr") == "8?>>"
Example #9
def extract_barcode(sam, barcode):

    parser_re = re.compile(".*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)")
    sam_file = AlignmentFile(sam, mode="r")
    filter_file = AlignmentFile("-", mode="wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue
        match = parser_re.match(aln.qname)
        CB = match.group("CB")
        if CB == barcode:
            filter_file.write(aln)
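A hedged invocation sketch for extract_barcode above; the path and barcode are placeholders, and the function assumes module-level `import re` and `from pysam import AlignmentFile`, streaming matching alignments to stdout as SAM.

# Hypothetical call: keep only alignments whose read names carry the cellular
# barcode ACGTACGT, e.g. for piping into `samtools view -b - > ACGTACGT.bam`.
extract_barcode("tagged_reads.sam", "ACGTACGT")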
Example #10
def read_bam_frag_filter(inbam, filter_exclude, all_bins, sections,
                         resolution, outdir, extra_out, region, start, end):
    bamfile = AlignmentFile(inbam, 'rb')
    refs = bamfile.references
    try:
        dico = {}
        for r in bamfile.fetch(region=region,
                               start=start - (1 if start else 0), end=end,  # coords starts at 0
                               multiple_iterators=True):
            if r.flag & filter_exclude:
                continue
            crm1 = r.reference_name
            pos1 = r.reference_start + 1
            crm2 = refs[r.mrnm]
            pos2 = r.mpos + 1
            try:
                pos1 = sections[(crm1, pos1 / resolution)]
                pos2 = sections[(crm2, pos2 / resolution)]
            except KeyError:
                continue  # not in the subset matrix we want
            try:
                dico[(pos1, pos2)] += 1
            except KeyError:
                dico[(pos1, pos2)] = 1
        cisprc = {}
        for (i, j), v in dico.iteritems():
            if all_bins[i][0] == all_bins[j][0]:
                try:
                    cisprc[i][0] += v
                    cisprc[i][1] += v
                except KeyError:
                    cisprc[i] = [v, v]
            else:
                try:
                    cisprc[i][1] += v
                except KeyError:
                    cisprc[i] = [0, v]
        out = open(path.join(outdir,
                             'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out)), 'w')
        dump(dico, out, HIGHEST_PROTOCOL)
        out.close()
        out = open(path.join(outdir, 'tmp_bins_%s:%d-%d_%s.pickle' % (
            region, start, end, extra_out)), 'w')
        dump(cisprc, out, HIGHEST_PROTOCOL)
        out.close()
    except Exception, e:
        exc_type, exc_obj, exc_tb = exc_info()
        fname = path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print e
        print(exc_type, fname, exc_tb.tb_lineno)
Example #11
    def test_calmd_bam(self):
        ref_fpath = os.path.join(TEST_DATA_DIR, "CUUC00007_TC01.fasta")
        bam_fpath = os.path.join(TEST_DATA_DIR, "sample.bam")
        orig_qual = AlignmentFile(bam_fpath).next().qual
        try:
            out_bam = NamedTemporaryFile()
            calmd_bam(bam_fpath, ref_fpath, out_bam.name)

            samfile = AlignmentFile(out_bam.name)
            calmd_qual = samfile.next().qual
            assert orig_qual != calmd_qual
            assert calmd_qual == "HHHHHHBHGGH!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        finally:
            if os.path.exists(out_bam.name):
                out_bam.close()
Example #12
 def __init__(self, filename):
     self.samfile = AlignmentFile(filename)
     # self.referenceInsertions will be keyed by offset into the reference
     # sequence. The inserted bases would need to begin at this offset. The
     # value will be a Counter whose keys are the nucleotides proposed for
     # insertion, with a value indicating how many times the nucleotide was
     # proposed for insertion at that offset.
     self.referenceInsertions = defaultdict(Counter)
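A small self-contained sketch of how a structure like self.referenceInsertions behaves; the offset and inserted bases here are made up for illustration.

from collections import Counter, defaultdict

referenceInsertions = defaultdict(Counter)

# Two reads propose inserting "A" at reference offset 1205, one proposes "AG".
referenceInsertions[1205]["A"] += 2
referenceInsertions[1205]["AG"] += 1

print(referenceInsertions[1205].most_common(1))  # [('A', 2)]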
Example #13
def gather_sv_data(options, collection):
	# Read regions of interest BED file
	regions = BedTool(options.region_file)

	# Read BAM file
	bamfile = AlignmentFile(options.bam_file, "rb")

	# Intersect regions
	for reg in regions:
		for read in bamfile.fetch(reg.chrom, reg.start, reg.end):
			#print read
			if read.query_name.endswith("2d"):
				collection[read.query_name] = []
			if read.query_name.startswith("ctg"):
				collection[read.query_name] = []
				#print read.reference_id, read.reference_start, read.reference_end
				#print read.query_name, read.query_alignment_start, read.query_alignment_end

	bamfile.close()
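A hedged invocation sketch for gather_sv_data; the file names are placeholders, `options` only needs region_file and bam_file attributes (so a bare Namespace is enough), the BAM must be indexed for fetch, and module-level `from pybedtools import BedTool` and `from pysam import AlignmentFile` are assumed.

from argparse import Namespace

options = Namespace(region_file="regions.bed", bam_file="sample.bam")
collection = {}
gather_sv_data(options, collection)
print("{} reads of interest".format(len(collection)))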
Example #14
File: umis.py Project: vals/umis
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    if is_python3():
        queryalignment = next(track)
    else:
        queryalignment = track.next()
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]

        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)
Example #15
    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader"
            self._loadReferenceFasta(referenceFastaFname)
    
Example #16
    return bigg


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamfile",
                        help="the sorted and indexed bam file")
    parser.add_argument("-o", "--out", default="bigg.bed",
                        help="the output file name")

    args = parser.parse_args()

    # make a file using the functions
    samfile = AlignmentFile(args.bamfile)

    fw = open(args.out, "w")

    for n, record in enumerate(samfile):
        try:
            bigg = sam_to_bigGenePred(record)
            fw.write(bigg.to_str())
            fw.write("\n")
        except ValueError:
            pass
        #if n>100:
            #break

    fw.close()
    samfile.close()
Example #17
 def __init__(self, output, indexed_sequence_list, index_options):
     header = self.build_header(indexed_sequence_list, index_options)
     ensure_dir_exists(output)
     self.writer = AlignmentFile(output, 'wb', header=header)
     self.lock = Lock()
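build_header is not shown above; AlignmentFile does accept a plain header dictionary, so a minimal sketch of what such a header might look like (the sequence name and length are placeholders):

from pysam import AlignmentFile

# Hypothetical minimal header: one reference sequence, unsorted output.
header = {"HD": {"VN": "1.6", "SO": "unsorted"},
          "SQ": [{"SN": "chr1", "LN": 248956422}]}
writer = AlignmentFile("out.bam", "wb", header=header)
writer.close()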
Example #18
    def test_downngrade_read_edges(self):
        # With softclip
        bam_fpath = os.path.join(TEST_DATA_DIR, "sample.bam")
        sam = AlignmentFile(bam_fpath)

        aligned_read = sam.next()
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        res = [9, 9, 9, 9, 9, 9, 3, 9, 8, 8, 9, 9, 9, 9, 9, 39, 39, 39, 38,
               38, 36, 33, 36, 38, 36, 38, 38, 38, 38, 39, 39, 38, 38, 38, 9,
               9, 9, 9]
        assert list(aligned_read.query_qualities) == res

        # without softclip
        sam = AlignmentFile(os.path.join(TEST_DATA_DIR, "seqs.bam"))

        aligned_read = sam.next()
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        expected = [11, 13, 11, 11, 37, 43, 43, 46, 46, 57, 57, 48, 57, 57,
                    42, 41, 32, 35, 38, 38, 38, 38, 41, 41, 39, 37, 37, 44,
                    42, 48, 47, 57, 47, 47, 48, 47, 57, 57, 54, 48, 57, 48,
                    54, 50, 50, 50, 50, 50, 57, 59, 54, 54, 54, 57, 57, 59,
                    57, 52, 52, 52, 52, 57, 57, 57, 57, 52, 52, 52, 52, 29,
                    27, 27, 22]

        assert list(aligned_read.query_qualities) == expected

        # reverse
        # rev seqs (the SAM specification stores the alignment query forward
        # (cigar, seq, qual, ...). Reverse is only noted in the flag)
        bam_fpath = os.path.join(TEST_DATA_DIR, "sample_rev.bam")
        sam = AlignmentFile(bam_fpath)

        aligned_read = sam.next()
        aligned_read = sam.next()
        aligned_read = sam.next()
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        res = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0]
        assert list(aligned_read.query_qualities[:14]) == res
Example #19
File: umis.py Project: roryk/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        sam_mode = 'r' if sam.endswith(".sam") else 'rb'
        sam_file = AlignmentFile(sam, mode=sam_mode)
        track = sam_file.fetch(until_eof=True)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    for i, aln in enumerate(track):
        count += 1
        if not count % 100000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info("%d were filtered for not matching known barcodes."
                            % nomatchcb)

        if aln.is_unmapped:
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)
        kept += 1

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(unicode(line))

    buf.seek(0)
    evidence_table = pd.read_csv(buf)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns=['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
Example #20
File: umis.py Project: vals/umis
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, 
                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
    ''' Count up evidence for tagged molecules, this implementation assumes the
    alignment file is coordinate sorted
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    if sam.endswith(".sam"):
        logger.error("To use the fasttagcount subcommand, the alignment file must be a "
                     "coordinate sorted, indexed BAM file.")
        sys.exit(1)

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map, need to be tsv.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
    bare_evidence = collections.defaultdict(float)
    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    transcript_map = collections.defaultdict(set)
    sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]]
    if gene_map:
        for transcript, gene in gene_map.items():
            if transcript in sam_transcripts:
                transcript_map[gene].add(transcript)
    else:
        for transcript in sam_transcripts:
            transcript_map[transcript].add(transcript)
    missing_transcripts = set()
    alignments_processed = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    current_transcript = None
    count_this_read = True
    transcripts_processed = 0
    genes_processed = 0
    cells = list(cb_hist.index)
    targets_seen = set()

    if umi_matrix:
        bare_evidence_handle = open(umi_matrix, "w")
        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")

    with open(out, "w") as out_handle:
        out_handle.write(",".join(["gene"] + cells) + "\n")
        for gene, transcripts in transcript_map.items():
            for transcript in transcripts:
                for aln in sam_file.fetch(transcript):
                    alignments_processed += 1

                    if aln.is_unmapped:
                        unmapped += 1
                        continue

                    if gene_tags and not aln.has_tag('GX'):
                        unmapped += 1
                        continue

                    if aln.qname != current_read:
                        current_read = aln.qname
                        if subsample and i not in index_filter:
                            count_this_read = False
                            continue
                        else:
                            count_this_read = True
                    else:
                        if not count_this_read:
                            continue

                    if parse_tags:
                        CB = aln.get_tag('CR')
                    else:
                        match = parser_re.match(aln.qname)
                        CB = match.group('CB')

                    if filter_cb:
                        if CB not in cb_hist.index:
                            nomatchcb += 1
                            continue

                    if parse_tags:
                        MB = aln.get_tag('UM')
                    else:
                        MB = match.group('MB')

                    if gene_tags:
                        target_name = aln.get_tag('GX').split(',')[0]
                    else:
                        txid = sam_file.getrname(aln.reference_id)
                        if gene_map:
                            if txid in gene_map:
                                target_name = gene_map[txid]
                            else:
                                missing_transcripts.add(txid)
                                continue
                        else:
                            target_name = txid
                    targets_seen.add(target_name)

                    # Scale evidence by number of hits
                    evidence[CB][MB] += weigh_evidence(aln.tags)
                    bare_evidence[CB] += weigh_evidence(aln.tags)
                    kept += 1
                transcripts_processed += 1
                if not transcripts_processed % 1000:
                    logger.info("%d genes processed." % genes_processed)
                    logger.info("%d transcripts processed." % transcripts_processed)
                    logger.info("%d alignments processed." % alignments_processed)

            earray = []
            for cell in cells:
                umis = [1 for _, v in evidence[cell].items() if v >= minevidence]
                earray.append(str(sum(umis)))
            out_handle.write(",".join([gene] + earray) + "\n")
            earray = []
            if umi_matrix:
                for cell in cells:
                    earray.append(str(int(bare_evidence[cell])))
                bare_evidence_handle.write(",".join([gene] + earray) + "\n")

            evidence = collections.defaultdict(lambda: collections.defaultdict(int))
            bare_evidence = collections.defaultdict(int)
            genes_processed += 1

    if umi_matrix:
        bare_evidence_handle.close()

    # fill dataframe with missing values, sort and output
    df = pd.read_csv(out, index_col=0, header=0)
    targets = pd.Series(index=set(transcript_map.keys()))
    targets = targets.sort_index()
    df = df.reindex(targets.index.values, fill_value=0)
    df = df.sort_index()
    df.to_csv(out)

    if umi_matrix:
        df = pd.read_csv(umi_matrix, index_col=0, header=0)
        df = df.reindex(targets.index.values, fill_value=0)
        df = df.sort_index()
        df.to_csv(umi_matrix)
Example #21
def samfile_from_args(args):
    return AlignmentFile(args.bam)
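A hedged wiring sketch for the one-liner above, assuming `args` comes from argparse with a --bam option (the flag name is a guess):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--bam", required=True, help="input SAM/BAM/CRAM file")
# args = parser.parse_args()
# samfile = samfile_from_args(args)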
Example #22
    def analyzeAlignment(self, alignmentOutputDirectory):
        print ('\nStep 2.) Parse the alignment and create a new consensus sequence.')
        
        # Load up the Alignment Reference file, we'll need it.
        alignmentReferenceFileName = join(alignmentOutputDirectory,'AlignmentReference.fasta')
        alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]
        
        # Count the reads in the input file
        totalReadCount = len(list(parse(self.readInput, self.readInputFormat)))
        #self.readInputFormat
        #self.readInput
                
        # We generate a new consensus sequence from the alignment results.
        newConsensusSequence = ""
        
        # Open the bam file
        bamfile = AlignmentFile(join(alignmentOutputDirectory,'alignment.bam'), 'rb')  
        
        # Open alignment analysis text file
        alignmentSummaryFile = createOutputFile(join(alignmentOutputDirectory,'AlignmentSummary.csv')) 
        alignmentSummaryFile.write('Ref_Position,Ref_Base,Reference_Adjustment,Aligned_Count,Unaligned_Count,Match_Count,Mismatch_Count,In_Count,Del_Count,A_Count,G_Count,C_Count,T_Count\n')
        
        # A smaller log. I will provide human-readable descriptions of the
        # bases that were adjusted in the new consensus sequence.
        # TODO: Provide surrounding sequence as well, maybe it's a repeat region....
        # Actually NAH, I want to just put it in the wrangler log. 
        #adjustedBasesSummaryFile = createOutputFile(join(alignmentOutputDirectory,'AdjustedBases.txt')) 
        
        # Todo: I should keep a more structured array of info for these alignments.
        # Store this info into an object
        #class columnStats():
        alignmentInfo = AlignmentInfo()
        
        # Keep a running total of adjustments made to the reference.
        # If this total is 0, then theoretically the consensus matches the alignment reference, and we're done.
        totalSequenceAdjustments = 0
        
        # Iterate the reference sequence column by column.
        pileupIterator = bamfile.pileup(alignmentRef.id)
        
        for pileupColumn in pileupIterator:
            
            currentAlignmentColumn = AlignmentColumn()
            #columnResults = None
           # columnResults.name='ll'
            #
            """referencePosition = 0
            referenceBase = ''
            referenceAdjustment = '?'
            alignedCount = 0
            unalignedCount = 0
            matchCount = 0
            mismatchCount = 0
            inCount = 0
            delCount = 0
            aCount = 0
            gCount = 0
            cCount = 0
            tCount = 0"""
            
            currentAlignmentColumn.referencePosition = pileupColumn.reference_pos
            currentAlignmentColumn.referenceBase = alignmentRef[pileupColumn.reference_pos].upper()
            currentAlignmentColumn.alignedCount = pileupColumn.nsegments
            currentAlignmentColumn.unalignedCount = totalReadCount - currentAlignmentColumn.alignedCount
            
            # Iterate the Reads at this position           
            for pileupRead in pileupColumn.pileups:
                
                # If this read is a deletion
                if(pileupRead.is_del == 1):
                    currentAlignmentColumn.delCount += 1
                # else if this read is an insertion
                elif(pileupRead.indel > 0):
                    
                    #print ('INSERTION DETECTED, INDEL=' + str(pileupRead.indel))  
                    currentAlignmentColumn.inCount += 1                   
                # Else if it is a refskip (TODO What does this mean? no read aligned? Count these?)
                elif(pileupRead.is_refskip):
                    print('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)
                    raise Exception('This read is a refskip, i dont know what that means:' + pileupRead.alignment.query_name)
                # else this means we have a base aligned at this position for this read.
                else:    
                    currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()                    
                    #print('Reference,Current:' + referenceBase + ',' + currentBase)
                    #print('Curr')
                    if(currentBase == currentAlignmentColumn.referenceBase):
                        currentAlignmentColumn.matchCount += 1
                    else:
                        currentAlignmentColumn.mismatchCount += 1
                   
                # Count the nucleotide 
                if (currentBase == 'A'):
                    currentAlignmentColumn.aCount += 1
                elif (currentBase == 'G'):
                    currentAlignmentColumn.gCount += 1
                elif (currentBase == 'C'):
                    currentAlignmentColumn.cCount += 1
                elif (currentBase == 'T'):
                    currentAlignmentColumn.tCount += 1
                else:
                    print('Unknown Base found in Alignment at position ' + str(currentAlignmentColumn.referencePosition) + ':' + currentBase)
                    raise Exception('Unknown Base in Alignment')
                
                
                # TODO: What if the query insertion sequence is longer than one base?
                # Maybe I can only adjust one base per iteration, is that okay? Probably for the Best, actually..
                # Don't worry bout it for now.
            
            # Calculate highest frequency base
            # I hope this algorithm makes sense, probably there is a smarter way to do it.
            if(currentAlignmentColumn.aCount >= currentAlignmentColumn.gCount and currentAlignmentColumn.aCount >= currentAlignmentColumn.cCount and currentAlignmentColumn.aCount >= currentAlignmentColumn.tCount):
                mostFrequentBase = 'A'
                mostFrequentBaseCount = currentAlignmentColumn.aCount
            elif(currentAlignmentColumn.gCount >= currentAlignmentColumn.cCount and currentAlignmentColumn.gCount >= currentAlignmentColumn.tCount):
                mostFrequentBase = 'G'
                mostFrequentBaseCount = currentAlignmentColumn.gCount
            elif(currentAlignmentColumn.cCount >= currentAlignmentColumn.tCount):
                mostFrequentBase = 'C'
                mostFrequentBaseCount = currentAlignmentColumn.cCount
            else:
                mostFrequentBase = 'T'
                mostFrequentBaseCount = currentAlignmentColumn.tCount


            
            # Add the next base to the new consensus sequence            
            if (currentAlignmentColumn.matchCount >= currentAlignmentColumn.mismatchCount and currentAlignmentColumn.matchCount >= currentAlignmentColumn.inCount and currentAlignmentColumn.matchCount >= currentAlignmentColumn.delCount):
                # Aligned bases match the reference, add reference base to the consensus.
                referenceAdjustment='-'
                newConsensusSequence += currentAlignmentColumn.referenceBase
                
            elif (currentAlignmentColumn.inCount >= currentAlignmentColumn.mismatchCount and currentAlignmentColumn.inCount >= currentAlignmentColumn.delCount):
                # Aligned bases show an insertion.
                # Add the Reference Base and the Insertion Base to the consensus.  
                totalSequenceAdjustments += 1 
                referenceAdjustment='I'  
                newConsensusSequence += currentAlignmentColumn.referenceBase + mostFrequentBase         
                
                self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Insertion' +
                    '\n(' + str(currentAlignmentColumn.inCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * currentAlignmentColumn.inCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads'
                    '\n(' + currentAlignmentColumn.referenceBase + ' > ' + currentAlignmentColumn.referenceBase + mostFrequentBase + ')' +
                    '\n')
                
                #TODO: I need to insert multiple bases, if that is what the alignment suggests.

            elif (currentAlignmentColumn.delCount >= currentAlignmentColumn.mismatchCount):
                # Reads show a deletion.
                # Don't add anything to the consensus.
                totalSequenceAdjustments += 1
                referenceAdjustment='D'
                
                self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Deletion' +
                    '\n(' + str(currentAlignmentColumn.delCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * currentAlignmentColumn.delCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads'
                    '\n(' + currentAlignmentColumn.referenceBase + ' > _)' +
                    '\n')
                
            else:
                # Mismatch base.
                # Add the highest read count base to the reference.
                # It might actually be the same base as the reference,
                # Because this just means there are more mismatches than matches.
                # Problematic base, at least we'll notice here.
                # TODO: What to do with highly heterozygous Positions?
                # I should report those that look particularly heterozygous, somewhere.
                newConsensusSequence += mostFrequentBase 
                totalSequenceAdjustments += 1     
                referenceAdjustment='M'   
                
                self.wranglerLog.write(str(currentAlignmentColumn.referencePosition) + ':Mismatch' +
                    '\n(' + str(mostFrequentBaseCount) + '/' + str(currentAlignmentColumn.alignedCount) + ') = ' + str((100.0 * mostFrequentBaseCount) / currentAlignmentColumn.alignedCount) + '% of aligned reads'
                    '\n(' + currentAlignmentColumn.referenceBase + ' > ' + mostFrequentBase + ')' +
                    '\n')
              

            # Write a line to the alignment Summary 
            alignmentSummaryFile.write(str(currentAlignmentColumn.referencePosition) + 
                ',' + str(currentAlignmentColumn.referenceBase) +
                ',' + str(referenceAdjustment) + 
                ',' + str(currentAlignmentColumn.alignedCount) + 
                ',' + str(currentAlignmentColumn.unalignedCount) + 
                ',' + str(currentAlignmentColumn.matchCount) + 
                ',' + str(currentAlignmentColumn.mismatchCount) + 
                ',' + str(currentAlignmentColumn.inCount) + 
                ',' + str(currentAlignmentColumn.delCount) + 
                ',' + str(currentAlignmentColumn.aCount) + 
                ',' + str(currentAlignmentColumn.gCount) + 
                ',' + str(currentAlignmentColumn.cCount) + 
                ',' + str(currentAlignmentColumn.tCount) +
                '\n')
            
            alignmentInfo.alignmentColumns.append(currentAlignmentColumn)
            
        print('\nTotal Sequence Adjustments:' + str(totalSequenceAdjustments) + ' (How many bases the consensus differs from the reference.)\n')    
        
        # Write the newly constructed consensus sequence.
        currentConsensusSequenceFileName = join(alignmentOutputDirectory, 'Consensus.fasta')        
        consensusWriter = createOutputFile(currentConsensusSequenceFileName)          
           
        # TODO: How to i give this a better name? Can I find a gene guess or something?
        sequenceID = "Consensus_Sequence"

        write([SeqRecord(Seq(newConsensusSequence,
            IUPAC.unambiguous_dna),
            id=sequenceID, description="") ], consensusWriter, 'fasta')
        consensusWriter.close()
            
        self.wranglerLog.write('Total Sequence Adjustments:' + str(totalSequenceAdjustments) + '\n')
            
        # Close Summary Files
        alignmentSummaryFile.close()
        #adjustedBasesSummaryFile.close()
        
        return alignmentInfo
Example #23
def sparse_count_reads_in_regions(bamfile,
                                  regions,
                                  storage,
                                  flank=0,
                                  log=None,
                                  template_length=1000,
                                  count_both_ends=False):
    """ This function obtains the counts per bins of equal size
    across the genome.

    The function automatically extracts the genome size from the
    bam file header.
    If group tags are available, they will be used to extract
    the indices from.
    Finally, the function automatically detects whether the bam-file
    contains paired-end or single-end reads.
    Paired-end reads are counted once at the mid-point between the two
    mates while single-end reads are counted at the 5' end.
    For paired-end reads it is optionally possible to count both read ends
    by setting count_both_ends=True.

    Parameters
    ----------
    bamfile :  str
        Path to a bamfile. The bamfile must be indexed.
    regions : str
        BED or GFF file containing the regions of interest.
    storage : str
        Path to the output file, which contains the counts per region.
    flank : int
        Extension of the regions in base pairs. Default: 0
    template_length : int
        Assumed template length. This is used when counting paired-end reads
        at the mid-point and the individual reads do not overlap with
        the given region, but the mid-point does.
    count_both_ends : bool
        Indicates whether for paired-end sequences, the ends of both mates should
        be counted separately. Default: False.
    """

    # Obtain the header information
    afile = AlignmentFile(bamfile, 'rb')

    # extract genome lengths
    if log is not None:
        f = open(log, 'w')
        fwrite = f.write
    else:
        fwrite = print
    fwrite('Make countmatrix from region\n')
    fwrite('bamfile: {}\n'.format(bamfile))
    fwrite('bedfile: {}\n\n'.format(regions))
    fwrite('get genomesize\n')
    # extract genome size
    genomesize = {}
    for chrom, length in zip(afile.references, afile.lengths):
        genomesize[chrom] = length
    fwrite('found {} chromosomes'.format(len(genomesize)))

    nreg = 0
    regfile = BedTool(regions)

    nreg = len(regfile)

    fwrite('number of regions to collect counts from: {}'.format(nreg))

    if 'RG' in afile.header:
        use_group = True
    else:
        use_group = False

    # get barcodes from header
    barcodes = {}
    if use_group:
        # extract barcodes
        for idx, item in enumerate(afile.header['RG']):
            barcodes[item['ID']] = idx
    else:
        barcodes['dummy'] = 0
    fwrite('found {} barcodes'.format(len(barcodes)))

    # barcode string for final table
    barcode_string = ';'.join([item['ID'] for item in afile.header['RG']])

    sdokmat = dok_matrix((nreg, len(barcodes)), dtype='int32')
    nbarcode_inregions = {key: 0 for key in barcodes}

    if count_both_ends:
        # if both ends are counted, template_length is irrelevant
        tlen = 0
    else:
        tlen = template_length

    for idx, iv in enumerate(regfile):

        iv.start -= flank
        iv.end += flank

        if iv.chrom not in genomesize:
            # skip over peaks/ regions from chromosomes
            # that are not contained in the bam file
            continue

        fetchstart = max(iv.start - tlen, 0)
        fetchend = min(iv.end + tlen, genomesize[iv.chrom])

        for aln in afile.fetch(iv.chrom, fetchstart, fetchend):
            if aln.is_proper_pair and aln.is_read1 and not count_both_ends:

                pos = min(aln.reference_start, aln.next_reference_start)

                # count paired end reads at midpoint
                midpoint = pos + abs(aln.template_length) // 2
                if midpoint >= iv.start and midpoint < iv.end:
                    sdokmat[
                        idx,
                        barcodes[aln.get_tag('RG'
                                             ) if use_group else 'dummy']] += 1
                    nbarcode_inregions[
                        aln.get_tag('RG') if use_group else 'dummy'] += 1

            if not aln.is_paired or count_both_ends:
                # count single-end reads at 5p end
                if not aln.is_reverse:
                    if aln.reference_start >= iv.start and aln.reference_start < iv.end:
                        sdokmat[
                            idx,
                            barcodes[aln.
                                     get_tag('RG'
                                             ) if use_group else 'dummy']] += 1
                else:
                    if aln.reference_start + aln.reference_length - 1 >= iv.start and \
                       aln.reference_start + aln.reference_length - 1 < iv.end:
                        sdokmat[
                            idx,
                            barcodes[aln.
                                     get_tag('RG'
                                             ) if use_group else 'dummy']] += 1
                nbarcode_inregions[
                    aln.get_tag('RG') if use_group else 'dummy'] += 1

    afile.close()

    fwrite('sparse matrix shape: {}'.format(sdokmat.shape))
    fwrite('density: {}'.format(sdokmat.nnz / np.prod(sdokmat.shape)))

    # store the results in COO sparse matrix format
    spcoo = sdokmat.tocoo()
    # sort lexicographically

    order_ = np.lexsort((spcoo.col, spcoo.row))
    indices = np.asarray([x for x in zip(spcoo.row, spcoo.col)],
                         dtype=np.int64)[order_]
    values = spcoo.data.astype(np.float32)[order_]
    cont = {'region': indices[:, 0], 'cell': indices[:, 1], 'count': values}

    df = pd.DataFrame(cont)
    with open(storage, 'w') as title:
        title.write('#  ' + barcode_string + '\n')

    df.to_csv(storage, mode='a', sep='\t', header=True, index=False)

    #main output file

    names = [key for key in barcodes]
    counts = [nbarcode_inregions[key] for key in barcodes]

    df = pd.DataFrame({'barcodes': names, 'counts': counts})

    df.to_csv(storage + '.counts', sep='\t', header=True, index=False)
    fwrite('total number of tags with barcodes: {}'.format(df.counts.sum()))
    if log is not None:
        f.close()
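A hedged invocation sketch for sparse_count_reads_in_regions; all paths are placeholders, the BAM must be indexed, and the regions file must be readable by pybedtools.BedTool.

# Hypothetical call: count fragments per peak, extending each region by 100 bp,
# and write the sparse count table plus a per-barcode summary next to it.
sparse_count_reads_in_regions("scatac.bam", "peaks.bed", "peak_counts.tsv",
                              flank=100, count_both_ends=False)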
Example #24
def extract_covariates(bam_path: str, reference_path: str, contig: str,
                       start: int, end: int, start_fetch: int, end_fetch: int,
                       filter_kwargs: dict, covariate_kwargs: dict):
    """
    Count mismatches and matches for similar base-calls

    Returns:
        match_mismatch(dict) : dictionary ( covariate_key: [mismatches, matches], .. )
    """
    # known is a set() containing locations of known variation (snps)
    # @todo: extend to indels
    global known  # <- Locations, set of (contig, position) tuples to ignore

    joined = dict()

    # Filters which select which reads are used to estimate covariates:
    min_mapping_quality = filter_kwargs.get('min_mapping_quality', 0)
    deduplicate = filter_kwargs.get('deduplicate', False)
    filter_qcfailed = filter_kwargs.get('filter_qcfailed', False)
    variant_blacklist_vcf_files = filter_kwargs.get(
        'variant_blacklist_vcf_files', None)

    # Obtain all variants in the selected range:
    blacklist = set()
    if variant_blacklist_vcf_files is not None:

        for path in variant_blacklist_vcf_files:
            with pysam.VariantFile(path) as bf:
                for record in bf.fetch(contig, start_fetch, end_fetch):
                    blacklist.add(record.pos)

    with AlignmentFile(bam_path) as alignments, FastaFile(
            reference_path) as fa:
        reference = CachedFasta(fa)  # @todo: prefetch selected region
        for read in alignments.fetch(contig, start_fetch, end_fetch):
            if (deduplicate and read.is_duplicate) or \
                    (read.is_qcfail and filter_qcfailed) or \
                    (read.mapping_quality < min_mapping_quality):
                continue

            for qpos, refpos, refbase in read.get_aligned_pairs(
                    matches_only=True, with_seq=True):

                if refpos > end or refpos < start:  # Prevent the same location to be counted multiple times
                    continue

                if refpos in blacklist:
                    continue

                refbase = refbase.upper()
                if refbase == 'N' or (read.reference_name, refpos) in known:
                    continue

                key = get_covariate_key(read, qpos, refpos, reference, refbase,
                                        **covariate_kwargs)
                if key is None:
                    continue

                matched = (refbase == read.query_sequence[qpos])
                try:
                    joined[key][matched] += 1
                except KeyError:
                    if matched:
                        joined[key] = array('l', [0, 1])
                    else:
                        joined[key] = array('l', [1, 0])
    return joined
Example #25
    def phaseHeterozygousReads(self):
    # TODO: Should this method accept a cluster count?
    # That will break some things. What things?
    # This method is only called from this file, in the summarizeAnalysis method.
    
        print('Splitting reads by heterozygous positions')
        
        # Get a list of reads for later.
        parsedReads = list(parse(self.readInput, self.readInputFormat))
        
        heterozygousConsensusDirectory = join(self.outputRootDirectory,'HeterozygousAlignment')

        # Open the bam file
        print ('opening final alignment_bamfile')
        bamfile = AlignmentFile(join(heterozygousConsensusDirectory,'alignment.bam'), 'rb')  
        
        # Load up the Alignment Reference file, we'll need it.
        alignmentReferenceFileName = join(heterozygousConsensusDirectory,'AlignmentReference.fasta')
        alignmentRef = list(parse(alignmentReferenceFileName, 'fasta'))[0]
     
        # get list of AlignedReads
        print ('Making a list of Aligned Reads.')
        readIDs = []
        for read in parsedReads:
            if not read.id in readIDs:
                readIDs.append(read.id)
        readIDs.sort()

        # Heterozygous base list
        heterozygousBasesSummaryFile = createOutputFile(join(heterozygousConsensusDirectory, 'HeterozygousBases.txt'))
        heterozygousBasesSummaryFile.write('List of Heterozygous Bases (0-based):\n')


        if (self.snps is not None and len(self.snps) > 0):
            # A string of SNPs was passed in, I don't need to calculate them myself.
            # TODO: I could write alignment stats here, like I do when i self-calculate the hetero positions.
            # This is just a simple list of 0-based positions.
            for snp in self.snps:
                heterozygousBasesSummaryFile.write(str(snp) + '\n')

        else:

            # get list of Heterozygous Positions
            # TODO: I suppose I don't need to align 100% of reads to determine heterozygosity.
            # Maybe this would speed up if i use a smaller alignment, or stop the loop after X reads
            print('Getting a list of Heterozygous Positions:')
            self.snps = []
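            # Note: pysam's pileup() caps coverage at max_depth=8000 by default; that cap also
            # applies to this heterozygosity scan (see the max_depth fix further down).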
            pileupIterator = bamfile.pileup(alignmentRef.id)
            for pileupColumn in pileupIterator:
                readCount = 0
                matchCount = 0
                mismatchCount = 0
                insCount = 0
                delCount = 0

                # dictionary of base counts.



                referenceBase = alignmentRef.seq[pileupColumn.pos].upper()

                # Iterate the Reads at this position. Each read at each position is either:
                # ins, Del, match, mismatch.

                #TODO: is it possible to exclude secondary/supplementary reads in the pileup() method?  No.
                for pileupRead in pileupColumn.pileups:

                    #TODO: Important. Filter secondary / supplementary reads; they are causing problems because these secondary reads are FULL of snps.
                    # Difficulty: these flags live on the aligned segment, not on the pileup read.

                    alignedSegmentObject = pileupRead.alignment

                    if alignedSegmentObject.is_secondary:
                        #print ('Secondary read at Position ' + str(pileupColumn.pos))
                        pass
                    elif alignedSegmentObject.is_supplementary:
                        #print ('Supplementary read at Position ' + str(pileupColumn.pos))
                        pass


                    # Just trying some things, not sure what these mean.
                    #elif (alignedSegmentObject.is_unmapped):
                    #    print('UNMAPPED READ!!!!!!!!!!!!!!!!!!! what does that mean?')
                    #elif (alignedSegmentObject.is_qcfail):
                    #    print('This read was a QC failure. What does that mean?????????????')

                    else:
                        readCount += 1
                        # indels
                        if(pileupRead.is_del == 1):
                            delCount += 1
                        elif(pileupRead.indel > 0):
                            insCount += 1
                        else:
                            currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()

                            if(currentBase == referenceBase):
                                matchCount += 1
                            else:
                                mismatchCount += 1


                    # This is a cheap way to stop analysis early: only analyze the first maxAnalyzedReadCounts reads.
                    # Potential problem: are these reads sorted somehow? Maybe my numbers are biased by only looking at the
                    # first reads.
                    # TODO: This is another parameter that can be tuned. Add to inputs? Maybe.
                    maxAnalyzedReadCounts = 1000

                    if(readCount > maxAnalyzedReadCounts):
                        break

                # Guard against columns where every read was filtered out as secondary/supplementary.
                if readCount == 0:
                    continue

                matchProportion = (1.0 * matchCount / readCount)
                insertionProportion = (1.0 * insCount / readCount)
                deletionProportion = (1.0 * delCount / readCount)
                mismatchProportion = (1.0 * mismatchCount / readCount)

                #print ('Position ' + str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Match/Mismatch : ' + str(matchCount) + '/' + str(mismatchCount))
                #print ('Match Percentage ' + str(matchProportion))

                # TODO: Should the accepted match proportion be a command-line parameter?
                # If more than 70% of bases agree on a single call (match, insertion, or deletion), this is not a heterozygous position.
                baseCutoff = .70

                if(matchProportion > baseCutoff or insertionProportion > baseCutoff or deletionProportion > baseCutoff):
                    pass
                    #print ('Position ' + str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount) + '/' + str(matchCount) + '/' + str(mismatchCount))
                    #print ('This position does not look heterozygous.')


                # If coverage is very low, we should not use this position.
                # This logic is flawed; I don't think this branch is ever triggered.
                elif ((1.0 * pileupColumn.n / readCount) < .25):
                    pass

                elif (mismatchProportion > baseCutoff):
                    pass

                # These are the hardcoded values I used for the DRA analysis. Cheating.

                # # I want to write a condition where we don't use the position if it's not clearly polymorphic.
                # #elif (False):
                # #    pass
                # # If the mismatch proportion is too high, what happens? What if there are 2 different bases that are mismatched, like if both my alleles have a different snp from reference. I'll miss that right now.
                #

                # # TEMP, this is very temporary. This is specific to a reference.
                # # TODO : Fix these hard coded values.
                # TODO: I don't really need this code, this is to ignore regions of my DRA reference.
                # Instead, I can pass in a list of 1-based polymorphic positions to sort based on those. A "whitelist" instead of a "blacklist"
                # # In a perfect world....I could tell what positions are heterozygous, but I can't.



                # # I can tell if this sequence is a homopolymer though, but looking at the bases around it.....But that's not the correct thing to do.
                # # I can keep this logic but make it a parameter. Big deletion regions are hard to analyze so I'm just ignoring them for now.
                # elif(5890 <= pileupColumn.pos <= 5970):
                #     print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.')
                #     pass
                # elif (6203 <= pileupColumn.pos <= 6212):
                #     print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.')
                #     pass
                # # Big String of A's
                # elif (774 <= pileupColumn.pos <= 796):
                #     print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.')
                #     pass
                # #Known homopolymer positions....this is terrible programming.
                #         # I could at least pass these in as ignored positions....
                # elif (pileupColumn.pos in (403,430, 1479, 1510, 1683,
                #         1991, 1996, 1997, 2003, 2009, 2093, 2100, 2133, 2134, 2191,
                #         2262, 2289, 2294, 2342, 2449, 2450, 2524, 2647, 2663, 2732,
                #         2895, 2902, 3113, 3114, 3180, 3197, 3362, 3396, 3453, 3542,
                #         3551, 3665, 3832, 3903, 3953, 4108, 4109, 4400, 4639, 4698,
                #         4703, 4769, 4785, 4786, 4828, 4878, 5084, 5301, 5302, 5449,
                #         5575, 5597, 6155, 6279, 6280, 6314, 6375, 6376, 6712, 6755,
                #         6790, 7084, 7631, 7718, 7769, 7971, 7978, 8132, 8133, 8134,
                #         8314, 8315, 8352, 8476, 8477, 8478, 8642, 8650, 8651, 8652,
                #         8653, 8654, 8655, 8656, 8657, 8698, 8725, 8753, 8759
                #         )):
                #     print('WARNING: I am skipping analysis on a region using hardcoded values, check this in allele_wrangler.')
                #     pass


                else:
                    #heterozygousBasesSummaryFile.write (str(pileupColumn.pos) + ', Coverage ' + str(pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(insCount) + '/' + str(matchCount) + '/' + str(mismatchCount) + '\n')
                    heterozygousBasesSummaryFile.write(str(pileupColumn.pos) + ', Coverage ' + str(
                        pileupColumn.n) + ', Deletion/Insertion/Match/Mismatch : ' + str(delCount) + '/' + str(
                        insCount) + '/' + str(matchCount) + '/' + str(mismatchCount)
                        + ' : ' + str(round(deletionProportion,2)) + '/'
                        + str(round(insertionProportion, 2)) + '/'
                        + str(round(matchProportion, 2)) + '/'
                        + str(round(mismatchProportion, 2))
                        + '\n')
                    self.snps.append(pileupColumn.pos)





        heterozygousBasesSummaryFile.close()
        #print ('Pileup Column # ' + str(pileupIterator))

        print('Calculating read distance arrays:')            
        # I'm making this distance array. In this array, a 0 represents a Match.  a 1 represents indels or substitutions.
        # This way I can calculate "distance" in an arbitrary number of dimensions
        # Distance is a euclidian way to represent how far away a read is from the consensus,
        # based on the heterozygous positions.  Each heterozygous position is a "dimension" in this space
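        # Illustration: with snps = [1012, 2200, 4307] (hypothetical positions), a read that matches
        # the consensus at the first two positions and mismatches at the third gets the array
        # [1, 1, -1]; positions the read does not cover stay 0 (see the encoding used below).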
        distanceArrays = {}
        for readID in readIDs:
            # TODO: A bug! Initializing this list with 0s will bias the results.
            # TODO: The pileupcolumn loop was not hitting each read -- only about half, sometimes; some reads were not analyzed.
            # Why? Spotted it: bamfile.pileup defaults to a maximum read depth of 8000.

            #distanceArrays[readID] = list([999] * len(self.snps))
            distanceArrays[readID] = list([0] * len(self.snps))


        # pileup() defaults to a maximum read depth of 8000; raise max_depth so every read is visited.
        pileupIterator = bamfile.pileup(alignmentRef.id,max_depth=99999999)
        #pileupIterator = bamfile.pileup(alignmentRef.id)
        for pileupColumn in pileupIterator:
            currentColumn = pileupColumn.pos
            
            # Only do this if the column number exists in our list of heterozygous positions
            if currentColumn in self.snps:
                
                heterozygousPositionIndex = self.snps.index(currentColumn)
                currentAnalyzedReadCount = 0  # A debugging counter, only used for the progress print below.
                
                referenceBase = alignmentRef.seq[currentColumn].upper()
                for pileupRead in pileupColumn.pileups:
                    currentAnalyzedReadCount += 1
                    readID = pileupRead.alignment.query_name
                    
                    #print('Pos:' + str(currentColumn) + ', Refbase:' + str(referenceBase) + ', Read:' + str(readID))
                    
                    # In this model, each distance component is 1, 0, or -1. This was intentional, but
                    # maybe we can tune the algorithm using these distances.
                    # This could also be tuned to do the heterozygous split using ONLY snps.
                    # TODO: if we're having problems splitting based on homopolymers, check this spot.
                    # Maybe I want to count indels as 0, no distance.
                    # TODO: Something to try: indels are -1, SNPs are 1, match = 0.
                    # Maybe that would help the sorting?
                    # TODO: Newest idea (used below): default to 0; 1 is a match; -1 is an indel or mismatch.
                    
                    if(pileupRead.is_del == 1):
                        distanceArrays[readID][heterozygousPositionIndex] = -1
                    elif(pileupRead.indel > 0):
                        distanceArrays[readID][heterozygousPositionIndex] = -1
                    else:   
                        currentBase = pileupRead.alignment.query_sequence[pileupRead.query_position].upper()  
                        if(currentBase == referenceBase):
                            #print('Assigning Match. Column=' + str(currentColumn) + ', CurrentBase:' + str(currentBase) + ', HeterozygousPosIndex=' + str(heterozygousPositionIndex))
                            distanceArrays[readID][heterozygousPositionIndex] = 1
                        else:
                            distanceArrays[readID][heterozygousPositionIndex] = -1

                print('At position ' + str(heterozygousPositionIndex + 1) + ' I analyzed ' + str(currentAnalyzedReadCount) + ' reads.')

        self.printDistanceArrays(distanceArrays, join(self.heterozygousDirectory, 'DistanceArrays.csv'))

        # TODO: The cluster count is hardcoded below (2 here; 3 also worked). Make it a parameter.
        clusteredReadIDs = self.clusterReads(distanceArrays, 2)

        # Dictionary of results to return. Key is location of the consensus sequence.
        # Value is the # of reads represented in this consensus alignment.
        coverageResults = {}

        for zeroBasedClusterIndex, readCluster in enumerate(clusteredReadIDs):
            # I want to call the strands 1 and 2, not 0 and 1.
            clusterIndex = zeroBasedClusterIndex + 1

            # Use a new name here to avoid shadowing the clusteredReadIDs list being iterated over.
            readIDsInCluster = readCluster.keys()

            clusterOutputDir = join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'ClusteredReads')

            distanceArrayFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'DistanceArrays.csv')
            self.printDistanceArrays(readCluster, distanceArrayFileName)

            readOutputFileName = join(clusterOutputDir, 'Strand' + str(clusterIndex) + 'Reads.' + self.readInputFormat)
            readOutputFile = createOutputFile(readOutputFileName)

            # Loop over the parsed reads and grab those belonging to this cluster.
            # FYI: it looks like every input read ends up in a cluster; I haven't found a missing read yet, but I should still check.
            for readObject in parsedReads:

                #print ('ReadClusterIndex=' + str(zeroBasedClusterIndex))
                #print ('AllReadID=' + str(readObject.id))

                for clusteredReadID in readIDsInCluster:
                    #print ('clusteredReadID=' + str(clusteredReadID))

                    if (readObject.id == clusteredReadID):
                        write([readObject], readOutputFile, self.readInputFormat)
                        break

            readOutputFile.close()

            currentWranglerObject = AlleleWrangler(
                readOutputFileName
                , join(self.outputRootDirectory, 'Strand' + str(clusterIndex) + 'Alignment')
                , join(self.heterozygousDirectory, 'AlignmentReference.fasta')
                , 6
                , self.numberThreads
                , False
                , self.snps)
            currentCoverageResults = currentWranglerObject.analyzeReads()

            # Merge the dictionaries of coverage values and return them.
            for key in currentCoverageResults.keys():
                coverageResults[key] = currentCoverageResults[key]

        print ('Done Phasing Reads.')
        return coverageResults
Ejemplo n.º 26
0
#! /usr/bin/env python

import sys
import re
from pysam import AlignmentFile
import pandas as pd

# Program to identify reads containing the leader sequence and TRS sequences
# from BAM files aligned to the MHV genome

bamfile = AlignmentFile(sys.argv[1], "rb")

pos = dict()

for read in bamfile.fetch("MHVA59"):

    leader = "TTTAAATCTAA"

    if re.search(leader, read.seq):

        CIGAR = read.cigartuples
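        # pysam cigartuples are (operation, length) pairs; operations are coded as integers:
        # 0=M (aligned), 1=I (insertion), 2=D (deletion), 3=N (splice/skip), 4=S (soft clip),
        # 5=H (hard clip), 6=P (padding), 7== (match), 8=X (mismatch).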

        d = dict()

        for op, length in CIGAR:
            d.setdefault(op, []).append(length)

        key = read.query_name
Ejemplo n.º 27
0
def build_ref_clusters(data, idx, iregion):
    """
    Given a chunk of regions this will pull in the reference for each region
    and then pull in all consens reads matching to that region. It uses cigar
    info to align the consens reads with the ref. This also merges consens
    from the same sample that were not merged earlier, which is why we expect
    no duplicate samples in the output of reference assemblies.
    """

    # prepare i/o for bamfile with mapped reads
    bamfile = AlignmentFile(
        os.path.join(data.dirs.across, "{}.cat.sorted.bam".format(data.name)),
        'rb')

    # dict to map chromosome names to integers
    faidict = chroms2ints(data, False)

    # prepare i/o for pysam reference indexed
    reffai = FastaFile(data.params.reference_sequence)

    # store path to cluster bit
    outbit = os.path.join(data.tmpdir, "aligned_{}.fa".format(idx))

    # get clusters
    iregions = iter(iregion)
    clusts = []

    while 1:
        # pull in all consens reads mapping to a bed region
        try:
            region = next(iregions)
            reads = bamfile.fetch(*region)
        except StopIteration:
            break

        # build a dict to reference seqs and cigars by name
        mstart = 9e12
        mend = 0
        rdict = {}
        for read in reads:
            rstart = read.reference_start
            rend = rstart + read.qlen
            mstart = min(mstart, rstart)
            mend = max(mend, rend)
            rdict[read.qname] = (read.seq, read.cigar, rstart, rend)
        keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

        # pull in the reference for this region (1-indexed)
        refs = reffai.fetch(region[0], mstart + 1, mend + 1)

        # make empty array
        rlen = mend - mstart
        arr = np.zeros((len(keys) + 1, rlen), dtype=bytes)
        arr[0] = list(refs.upper())

        # fill arr with remaining samples
        for idx, key in enumerate(keys):
            seq, cigar, start, end = rdict[key]

            # how far ahead of ref start and short of ref end is this read
            fidx = start - mstart
            eidx = arr.shape[1] - (mend - end)

            # enter into the array, trim end if longer than pulled ref
            arr[idx + 1, fidx:eidx] = list(seq)[:eidx - fidx]

            # mod sequence according to cigar for indels and ambigs
            # csums is the location of impute on the seq, so it must be
            # incremented by fidx and not extend past eidx
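            # CIGAR op 4 is a soft clip (the affected base is lowercased) and op 1 is an
            # insertion (the affected base is masked with b"-").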
            for cidx, cig in enumerate(cigar):
                if cig[0] == 4:
                    csums = sum(i[1] for i in cigar[:cidx])
                    csums += eidx
                    if csums < fidx:
                        arr[idx + 1, csums] = arr[idx + 1, csums].lower()
                if cig[0] == 1:
                    csums = sum(i[1] for i in cigar[:cidx])
                    csums += eidx
                    if csums < fidx:
                        arr[idx + 1, csums] = b"-"

        # fill terminal edges with N
        arr[arr == b""] = b"N"

        # duplicates merge here (only perfect merge on all Ns) and reshape
        # the array to match. This will need to be resolved in catgs...
        # if it does not merge then
        try:
            keys, arr = resolve_duplicates(keys, arr)
        except IPyradError:
            pass

        # get consens seq and variant site index
        clust = [
            ">reference_{}:{}:{}-{}\n{}".format(
                0,
                faidict[region[0]] + 1,
                mstart + 1,
                mend + 1,  # 1-indexed
                b"".join(arr[0]).decode())
        ]
        for idx, key in enumerate(keys):
            clust.append(">{}\n{}".format(key,
                                          b"".join(arr[idx + 1]).decode()))
        clusts.append("\n".join(clust))

    # dump to temp file until concat in next step.
    with open(outbit, 'w') as outfile:
        if clusts:
            outfile.write("\n//\n//\n".join(clusts) + "\n//\n//\n")
Ejemplo n.º 28
0
def open_bam_file(file_name):

    try:
        return AlignmentFile(file_name, 'rb')
    except ValueError:
        return open(file_name)
Ejemplo n.º 29
0
    def __init__(self, bam_file):

        from pysam import AlignmentFile

        bam = AlignmentFile(bam_file)
        self.bam_header = bam.header
Ejemplo n.º 30
0
Archivo: io.py Proyecto: dkurt/bonito
class Writer(Thread):

    def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False, ref_fn=None, groups=None, group_key=None):
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.duplex = duplex
        self.aligner = aligner
        self.iterator = iterator
        self.fastq = mode == 'wfq'
        self.group_key = group_key
        self.output = AlignmentFile(
            fd, 'w' if self.fastq else self.mode, add_sam_header=not self.fastq,
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=aligner.seq_names if aligner else [],
                reference_lengths=[
                    len(aligner.seq(name)) for name in aligner.seq_names
                ] if aligner else [],
                text=sam_header(groups),
            )
        )

    def run(self):
        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, res in self.iterator:

                seq = res['sequence']
                qstring = res.get('qstring', '*')
                mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = res.get('mapping', False)
                mods_tags = res.get('mods', [])

                if self.duplex:
                    samples = len(read[0].signal) + len(read[1].signal)
                    read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
                else:
                    samples = len(read.signal)
                    read_id = read.read_id

                tags = [
                    f'RG:Z:{read.run_id}_{self.group_key}',
                    f'qs:i:{round(mean_qscore)}',
                    *read.tagdata(),
                    *mods_tags,
                ]

                if len(seq):
                    if self.mode == 'wfq':
                        write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                    else:
                        self.output.write(
                            AlignedSegment.fromstring(
                                sam_record(read_id, seq, qstring, mapping, tags=tags),
                                self.output.header
                            )
                        )
                    if self.duplex:
                        summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                    else:
                        summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                    self.log.append((read_id, samples))

                else:
                    logger.warning("> skipping empty sequence %s", read_id)
Ejemplo n.º 31
0
def run_process(opts, mutect2_vcf, mutect2_bam):
    outputvcf = opts.output

    # Open VCF, BAM
    m2vcf = VariantFile(mutect2_vcf)
    m2bam = AlignmentFile(mutect2_bam, 'rb')

    old_chrom = ''
    old_pos = -1
    old_ref = ''
    old_alts = ()
    variants_list = list()

    # Find pairs of adjacent SNVs that were split by Mutect2 and can be merged
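    # Illustration (hypothetical records): chr1:1000 A>T immediately followed by chr1:1001 G>C
    # is collected below as a single merged candidate chr1:1000-1001 AG>TC.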
    for record in m2vcf.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        if chrom == old_chrom and pos == old_pos + 1 and len(old_ref) == 1 and len(ref) == 1 and len(alts) == 1:
            tmp_dict = {
                "chrom" : chrom,
                "start_pos" : old_pos,
                "end_pos" : pos,
                "ref" : old_ref + ref,
                "alt" : old_alts[0] + alts[0]
            }
            variants_list.append(tmp_dict)
        old_chrom = chrom
        old_pos = pos
        old_ref = ref
        old_alts = alts

    # Get Read Information
    for v in variants_list:
        reads = m2bam.fetch(v["chrom"], v["start_pos"] - 1, v["end_pos"])
        ref_read_cnt = 0
        alt_read_cnt = 0
        alt_first_cnt = 0
        alt_second_cnt = 0
        f1r2_ref_cnt = 0
        f2r1_ref_cnt = 0
        f1r2_alt_cnt = 0
        f2r1_alt_cnt = 0
        dp = 0
        for read in reads:
            if not read.is_secondary and not read.is_supplementary and not read.is_unmapped and not read.is_duplicate:
                query_position_list = read.get_reference_positions()
                try:
                    q_start_index = query_position_list.index(v["start_pos"]-1)
                    q_end_index = query_position_list.index(v["end_pos"]-1)
                    query_seq = read.query_sequence[q_start_index] + read.query_sequence[q_end_index]
                    if query_seq == v["ref"]:
                        ref_read_cnt += 1
                        if read.is_read1:
                            f1r2_ref_cnt += 1
                        elif read.is_read2:
                            f2r1_ref_cnt += 1
                    elif query_seq == v["alt"]:
                        alt_read_cnt += 1
                        if read.is_read1:
                            f1r2_alt_cnt += 1
                        elif read.is_read2:
                            f2r1_alt_cnt += 1
                    elif query_seq[0] != v["ref"][0] and query_seq[1] == v["ref"][1]:
                        alt_first_cnt += 1
                    elif query_seq[0] == v["ref"][0] and query_seq[1] != v["ref"][1]:
                        alt_second_cnt += 1
                    dp += 1
                except (ValueError, IndexError):
                    # this read does not cover the variant positions
                    continue
        v["ref_cnt"] = ref_read_cnt
        v["alt_cnt"] = alt_read_cnt
        v["alt_first_cnt"] = alt_first_cnt
        v["alt_second_cnt"] = alt_second_cnt
        v["f1r2"] = (f1r2_ref_cnt, f1r2_alt_cnt)
        v["f2r1"] = (f2r1_ref_cnt, f2r1_alt_cnt)
        v["dp"] = dp

    # Build per-record keep/drop flags (True = keep the original record)
    m2vcf_index = 0
    m2vcf_flag = list()
    second_flag = True
    for record in m2vcf.fetch():
        chrom = record.chrom
        pos = record.pos

        if second_flag == True:
            m2vcf_flag.append(True)
        else:
            m2vcf_flag.append(False)
            second_flag = True

        for v in variants_list:
            if v["chrom"] == chrom and v["start_pos"] == pos and v["alt_cnt"] != 0:
                if v["alt_first_cnt"] == 0:
                    m2vcf_flag[m2vcf_index] = False
                if v["alt_second_cnt"] == 0:
                    second_flag = False

        m2vcf_index += 1

    # Write records & VCF
    new_header = m2vcf.header
    new_header.formats.add("MDV", "1", "Integer", "Merged di-allelic variant: back-phased variant that was previously split into SNPs")
    vcf_out = VariantFile(outputvcf if outputvcf else '-','w',header=new_header)

    m2vcf_index = 0
    for record in m2vcf.fetch():
        chrom = record.chrom
        pos = record.pos

        if m2vcf_flag[m2vcf_index] == True:
            vcf_out.write(record)

        for v in variants_list:
            if v["chrom"] == chrom and v["start_pos"] == pos and v["alt_cnt"] != 0:
                record2 = vcf_out.new_record()
                record2.chrom = v["chrom"]
                record2.pos = v["start_pos"]
                record2.ref = v["ref"]
                record2.alts = (v["alt"],)
                record2.info["DP"] = v["dp"]
                if "F1R2" in record2.samples[0]:
                    record2.samples[0]["F1R2"] = v["f1r2"]
                    record2.samples[0]["F2R1"] = v["f2r1"]
                record2.samples[0]["AD"] =  (v["ref_cnt"], v["alt_cnt"])
                record2.samples[0]["DP"] = v["dp"]
                record2.samples[0]["AF"] =  float(v["alt_cnt"]) / float(v["dp"])
                record2.samples[0]["GT"] = ("0", "0")
                record2.samples[0]["MDV"] = True
                vcf_out.write(record2)
            else:
                continue

        m2vcf_index += 1
Ejemplo n.º 32
0
Archivo: io.py Proyecto: dkurt/bonito
class CTCWriter(Thread):
    """
    CTC writer process that writes output numpy training data.
    """
    def __init__(
            self, mode, iterator, aligner, fd=sys.stdout, min_coverage=0.90,
            min_accuracy=0.99, ref_fn=None, groups=None
    ):
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.aligner = aligner
        self.iterator = iterator
        self.min_coverage = min_coverage
        self.min_accuracy = min_accuracy
        self.output = AlignmentFile(
            fd, 'w' if self.mode == 'wfq' else self.mode, add_sam_header=self.mode != 'wfq',
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=aligner.seq_names,
                reference_lengths=[len(aligner.seq(name)) for name in aligner.seq_names],
                text=sam_header(groups),
            )
        )

    def run(self):

        chunks = []
        targets = []
        lengths = []

        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, ctc_data in self.iterator:

                seq = ctc_data['sequence']
                qstring = ctc_data['qstring']
                mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = ctc_data.get('mapping', False)

                self.log.append((read.read_id, len(read.signal)))

                if len(seq) == 0 or mapping is None:
                    continue

                cov = (mapping.q_en - mapping.q_st) / len(seq)
                acc = mapping.mlen / mapping.blen
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

                if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                    continue

                self.output.write(
                    AlignedSegment.fromstring(
                        sam_record(read.read_id, seq, qstring, mapping),
                        self.output.header
                    )
                )
                summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                if mapping.strand == -1:
                    refseq = mappy.revcomp(refseq)
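
                # The translate() table below maps the ASCII codes for A(65), C(67), G(71), T(84)
                # to the character digits '1'-'4', giving integer class labels for the CTC targets.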

                target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})]
                targets.append(target)
                chunks.append(read.signal)
                lengths.append(len(target))

        if len(chunks) == 0:
            sys.stderr.write("> no suitable ctc data to write\n")
            return

        chunks = np.array(chunks, dtype=np.float16)
        targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
        for idx, target in enumerate(targets): targets_[idx, :len(target)] = target
        lengths = np.array(lengths, dtype=np.uint16)
        indices = np.random.permutation(typical_indices(lengths))

        chunks = chunks[indices]
        targets_ = targets_[indices]
        lengths = lengths[indices]

        summary = pd.read_csv(summary_file(), sep='\t')
        summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

        output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"), chunks)
        np.save(os.path.join(output_directory, "references.npy"), targets_)
        np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write("  - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape)))
        sys.stderr.write("  - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape)))
        sys.stderr.write("  - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape)))

    def stop(self):
        self.join()
Ejemplo n.º 33
0
    def __init__(self, bam_file):

        bam = AlignmentFile(bam_file)
        self.bam_header = bam.header
Ejemplo n.º 34
0
def open_bamfile(sam):
    from pysam import AlignmentFile
    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    return AlignmentFile(sam, mode=sam_mode)
Ejemplo n.º 35
0
import sys
from pysam import AlignmentFile
from argparse import ArgumentParser

valid_spliced_reads=0
problem_reads=0

parser = ArgumentParser()
parser.add_argument('infile', nargs='?', default='-')
parser.add_argument('outfile', nargs='?', default='-')
args = parser.parse_args()
infile = AlignmentFile(args.infile, 'r')
outfile = AlignmentFile(args.outfile, 'wh', template=infile)
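
# Keep reads that (i) have mapping quality >= 10, (ii) span spliced gaps (CIGAR 'N')
# totalling more than 50 bp, and (iii) have every aligned ('M') block at least 6 bp long.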

for read in infile:
    splice_len = 0
    min_edge = 1e6
    if read.mapping_quality < 10: continue
    for cig_op, cig_len in read.cigartuples:
        if cig_op == 3: # N
            splice_len += cig_len
        elif cig_op == 0:
            min_edge = min(min_edge, cig_len)
    if splice_len > 50 and min_edge >= 6:
        outfile.write(read)
        valid_spliced_reads += 1
        if valid_spliced_reads % 100000 == 0:
            sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )
sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )

Ejemplo n.º 36
0
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames   = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s    = [r["M5"] for r in refRecords]
        refIds = [self.peer.get_tid(name) for name in refNames]
        nRefs = len(refRecords)

        if nRefs > 0:
            self._referenceInfoTable = np.rec.fromrecords(zip(
                refIds,
                refIds,
                refNames,
                refNames,
                refLengths,
                refMD5s,
                np.zeros(nRefs, dtype=np.uint32),
                np.zeros(nRefs, dtype=np.uint32)),
                dtype=[('ID', '<i8'), ('RefInfoID', '<i8'),
                       ('Name', 'O'), ('FullName', 'O'),
                       ('Length', '<i8'), ('MD5', 'O'),
                       ('StartRow', '<u4'), ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []
        self._featureNameMappings = {}  # RGID -> ("abstract feature name" -> actual feature name)

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""])
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]
            readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            featureNameMapping = { key.split(":")[0] : key
                                   for key in ds.keys()
                                   if key in PULSE_FEATURE_TAGS }
            self._featureNameMappings[rgID] = featureNameMapping

        self._readGroupTable = np.rec.fromrecords(
            readGroupTable_,
            dtype=[("ID"                 , np.int32),
                   ("MovieName"          , "O"),
                   ("ReadType"           , "O"),
                   ("SequencingChemistry", "O"),
                   ("FrameRate",           float)])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = { rg.ID : rg
                                for rg in self._readGroupTable }

        # The pulse features "available" to clients of this file are the intersection
        # of pulse features available from each read group.
        self._pulseFeaturesAvailable = set.intersection(
            *[set(mapping.keys()) for mapping in self._featureNameMappings.values()])

    def _loadProgramInfo(self):
        pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
                      for pg in self.peer.header.get("PG", []) ]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(
                pgRecords,
                dtype=[("ID"     ,     "O"),
                       ("Version",     "O"),
                       ("CommandLine", "O")])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens   = set((c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch("FASTA file must contain a superset of the reference contigs in the BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0.1
        try:
            checkedVersion = self.version
            if "b" in checkedVersion:
                raise Exception()
            else:
                major, minor, patch = (int(x) for x in checkedVersion.split('.'))
                assert (major, minor, patch) >= (3, 0, 1)
        except Exception:
            raise IncompatibleFile(
                "This BAM file is incompatible with this API " +
                "(only PacBio BAM files version >= 3.0.1 are supported)")

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
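        # check_sq=False: do not require @SQ header lines, so unaligned PacBio BAMs can be opened too.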
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError("Unmapped BAM file -- a reference FASTA should not be given as an argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not(self.isMapped)

    @property
    def isMapped(self):
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        if hasattr(self, "file") and self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
Ejemplo n.º 37
0
def write_matrix(inbam,
                 resolution,
                 biases,
                 outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 normalizations=('decay', ),
                 region1=None,
                 start1=None,
                 end1=None,
                 clean=True,
                 region2=None,
                 start2=None,
                 end2=None,
                 extra='',
                 half_matrix=True,
                 nchunks=100,
                 tmpdir='.',
                 append_to_tar=None,
                 ncpus=8,
                 cooler=False,
                 row_names=False,
                 chr_order=None,
                 verbose=True):
    """
    Writes a matrix file from a BAM file containing interacting reads. The matrix
    will be extracted from the genomic BAM; its genomic coordinates will be the
    intersection of the two regions defined by the parameters region1, start1, end1
    and region2, start2, end2. If the wanted matrix is symmetric (a single region),
    the second set of coordinates can be skipped.

    :param inbam: path to BAM file (generated by TADbit)
    :param resolution: resolution at which we want to write the matrix
    :param biases: path to a file with biases
    :param outdir: path to a folder where to write output files
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters to define the
       set of valid pairs of reads.
    :param ('decay',) normalizations: tuple with normalizations to use; can be 'decay',
       'norm' and/or 'raw'. One file per normalization will be created.
    :param None region1: chromosome name of the first region from which to
       extract the matrix
    :param None start1: start coordinate of the first region from which to
       extract the matrix
    :param None end1: end coordinate of the first region from which to
       extract the matrix
    :param None region2: chromosome name of the second region from which to
       extract the matrix
    :param None start2: start coordinate of the second region from which to
       extract the matrix
    :param None end2: end coordinate of the second region from which to
       extract the matrix
    :param True half_matrix: writes only half of the matrix (and the diagonal)
    :param '.' tmpdir: where to write temporary files
    :param None append_to_tar: path to a TAR file where generated matrices will
       be written directly
    :param 8 ncpus: number of cpus to use to read the BAM file
    :param True verbose: speak
    :param False row_names: writes genomic coordinates instead of bins.
       WARNING: results in two extra columns
    :param None chr_order: chromosome order
    :param 100 nchunks: maximum number of chunks into which to cut the BAM

    :returns: path to output files
    """
    if start1 is not None and end1:
        if end1 - start1 < resolution:
            raise Exception(
                'ERROR: region1 should be at least as big as resolution')
    if start2 is not None and end2:
        if end2 - start2 < resolution:
            raise Exception(
                'ERROR: region2 should be at least as big as resolution')

    if isinstance(normalizations, list):
        normalizations = tuple(normalizations)
    elif isinstance(normalizations, str):
        normalizations = tuple([normalizations])

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(inbam,
                                                      filter_exclude,
                                                      resolution,
                                                      ncpus=ncpus,
                                                      region1=region1,
                                                      start1=start1,
                                                      end1=end1,
                                                      region2=region2,
                                                      start2=start2,
                                                      end2=end2,
                                                      tmpdir=tmpdir,
                                                      nchunks=nchunks,
                                                      chr_order=chr_order,
                                                      verbose=verbose)

    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        list(zip(bamfile.references, [x for x in bamfile.lengths])))

    start_bin1, end_bin1, start_bin2, end_bin2 = bin_coords

    section_pos1 = {}
    section_pos2 = {}
    totals = OrderedDict()
    total_num = 0
    for c in sections:
        totals[c] = total_num
        total_num += sections[c] // resolution + 1

    if row_names:
        if len(regions) in [1, 2]:
            offset = start_bin1 - totals[regions[0]]
            section_pos1 = dict((i, (region1, resolution * (i + offset)))
                                for i in range(end_bin1 - start_bin1))
            if region2:
                offset = start_bin2 - totals[regions[1]]
                section_pos2 = dict((i, (region2, resolution * (i + offset)))
                                    for i in range(end_bin2 - start_bin2))
        else:
            section_pos1 = dict((v + i, (c, i)) for c, v in totals.items()
                                for i in range(sections[c] // resolution + 1))
            section_pos2 = section_pos1

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
    elif normalizations != ('raw', ):
        raise Exception(
            'ERROR: should provide path to file with biases (pickle).')
    else:
        bads1 = bads2 = {}

    if verbose:
        printime('  - Writing matrices')
    # define output file name
    name = _generate_name(regions, (start1, start2), (end1, end2), resolution,
                          chr_order)

    # prepare file header
    outfiles = []
    if cooler:
        if 'h5py' not in modules:
            raise Exception(
                'ERROR: cooler output is not available. Probably ' +
                'you need to install h5py\n')
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            raise Exception(
                'ERROR: decay and raw&decay matrices cannot be exported '
                'to cooler format. Cooler only accepts weights per column/row')
        fnam = 'raw_%s_%s%s.mcool' % (name, nicer(resolution).replace(' ', ''),
                                      ('_' + extra) if extra else '')
        if os.path.exists(os.path.join(outdir, fnam)):
            os.remove(os.path.join(outdir, fnam))
        out_raw = cooler_file(os.path.join(outdir, fnam), resolution, sections,
                              regions)
        out_raw.create_bins()
        out_raw.prepare_matrix(start_bin1, start_bin2)
        outfiles.append((os.path.join(outdir, fnam), fnam))
    else:
        if 'raw' in normalizations:
            fnam = 'raw_%s_%s%s.abc' % (name, nicer(resolution).replace(
                ' ', ''), ('_' + extra) if extra else '')
            if append_to_tar:
                out_raw = StringIO()
                outfiles.append((out_raw, fnam))
            else:
                out_raw = open(os.path.join(outdir, fnam), 'w')
                outfiles.append((os.path.join(outdir, fnam), fnam))
            for reg in regions:
                out_raw.write('# CRM %s\t%d\n' % (reg, sections[reg]))

            out_raw.write('# %s resolution:%d\n' % (name, resolution))
            if region2:
                out_raw.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                out_raw.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
            else:
                out_raw.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))

        # write file header
        if 'norm' in normalizations:
            fnam = 'nrm_%s_%s%s.abc' % (name, nicer(resolution).replace(
                ' ', ''), ('_' + extra) if extra else '')
            if append_to_tar:
                out_nrm = StringIO()
                outfiles.append((out_nrm, fnam))
            else:
                out_nrm = open(os.path.join(outdir, fnam), 'w')
                outfiles.append((os.path.join(outdir, fnam), fnam))
            for reg in regions:
                out_nrm.write('# CRM %s\t%d\n' % (reg, sections[reg]))

            out_nrm.write('# %s resolution:%d\n' % (name, resolution))
            if region2:
                out_nrm.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                out_nrm.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
            else:
                out_nrm.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
        if 'decay' in normalizations or 'raw&decay' in normalizations:
            fnam = 'dec_%s_%s%s.abc' % (name, nicer(resolution).replace(
                ' ', ''), ('_' + extra) if extra else '')
            if append_to_tar:
                out_dec = StringIO()
                outfiles.append((out_dec, fnam))
            else:
                out_dec = open(os.path.join(outdir, fnam), 'w')
                outfiles.append((os.path.join(outdir, fnam), fnam))
            for reg in regions:
                out_dec.write('# CRM %s\t%d\n' % (reg, sections[reg]))

            out_dec.write('# %s resolution:%d\n' % (name, resolution))
            if region2:
                out_dec.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                out_dec.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
            else:
                out_dec.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))

    # functions to write lines of pairwise interactions
    def write_raw(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_raw.write('{}\t{}\n'.format(get_name(a, b), v))

        def writer(_, a, b, v):
            out_raw.write('{}\t{}\n'.format(get_name(a, b), v))

        return writer2 if func else writer

    def write_bias(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_nrm.write('{}\t{}\n'.format(get_name(a, b),
                                            v / bias1[a] / bias2[b]))

        def writer(_, a, b, v):
            out_nrm.write('{}\t{}\n'.format(get_name(a, b),
                                            v / bias1[a] / bias2[b]))

        return writer2 if func else writer

    def write_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\n'.format(
                get_name(a, b),
                v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\n'.format(
                get_name(a, b),
                v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))

        return writer2 if func else writer

    def write_expc_2reg(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            out_dec.write('{}\t{}\n'.format(
                get_name(a, b), v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))

        def writer(c, a, b, v):
            out_dec.write('{}\t{}\n'.format(
                get_name(a, b), v / bias1[a] / bias2[b] /
                decay[c][abs((a + start_bin1) - (b + start_bin2))]))

        return writer2 if func else writer

    def write_expc_err(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\n'.format(
                    get_name(a, b),
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\n'.format(get_name(a, b), 'nan'))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\n'.format(
                    get_name(a, b),
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\n'.format(get_name(a, b), 'nan'))

        return writer2 if func else writer

    def write_raw_and_expc(func=None):
        def writer2(c, a, b, v):
            func(c, a, b, v)
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    get_name(a, b), v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(get_name(a, b), v,
                                                    v / bias1[a] / bias2[b]))

        def writer(c, a, b, v):
            try:
                out_dec.write('{}\t{}\t{}\n'.format(
                    get_name(a, b), v,
                    v / bias1[a] / bias2[b] / decay[c][abs(a - b)]))
            except KeyError:  # different chromosomes
                out_dec.write('{}\t{}\t{}\n'.format(get_name(a, b), v,
                                                    v / bias1[a] / bias2[b]))

        return writer2 if func else writer

    def get_row_name(a, b):
        return '{}\t{}\t{}\t{}\t'.format(*(section_pos1[a] + section_pos2[b]))

    def get_bin_name(a, b):
        return '{}\t{}\t'.format(a, b)

    get_name = get_row_name if row_names else get_bin_name

    write = None
    if 'raw' in normalizations:
        write = write_raw(write)
    if 'norm' in normalizations and not cooler:
        write = write_bias(write)
    if 'decay' in normalizations and not cooler:
        if len(regions) in [1, 2]:
            if region2:
                write = write_expc_2reg(write)
            else:
                write = write_expc(write)
        else:
            write = write_expc_err(write)
    if 'raw&decay' in normalizations and not cooler:
        write = write_raw_and_expc(write)
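
    # The writers above are chained: each write_* factory wraps the previous writer, so a single
    # call write(c, j, k, v) emits one line to every requested normalization output.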

    # pull all sub-matrices and write full matrix
    if region2 is not None:  # already half-matrix in this case
        half_matrix = False

    if cooler:
        for ichunk, c, j, k, v in _iter_matrix_frags(chunks,
                                                     tmpdir,
                                                     rand_hash,
                                                     verbose=verbose,
                                                     clean=clean,
                                                     include_chunk_count=True):
            if j > k:
                continue
            if j not in bads1 and k not in bads2:
                out_raw.write_iter(ichunk, j, k, v)
        out_raw.close()
    else:
        if half_matrix:
            for c, j, k, v in _iter_matrix_frags(chunks,
                                                 tmpdir,
                                                 rand_hash,
                                                 verbose=verbose,
                                                 clean=clean):
                if k > j:
                    continue
                if j not in bads1 and k not in bads2:
                    write(c, j, k, v)
        else:
            for c, j, k, v in _iter_matrix_frags(chunks,
                                                 tmpdir,
                                                 rand_hash,
                                                 verbose=verbose,
                                                 clean=clean):
                if j not in bads1 and k not in bads2:
                    write(c, j, k, v)

    fnames = {}
    if append_to_tar:
        lock = LockFile(append_to_tar)
        with lock:
            archive = taropen(append_to_tar, "a:")
            for fobj, fnam in outfiles:
                fobj.seek(0)
                info = archive.tarinfo(name=fnam)
                info.size = len(fobj.buf)
                archive.addfile(tarinfo=info, fileobj=fobj)
            archive.close()
    else:
        if cooler:
            fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                fnam = 'nrm_%s_%s%s.mcool' % (name, nicer(resolution).replace(
                    ' ', ''), ('_' + extra) if extra else '')
                copyfile(outfiles[0][0], os.path.join(outdir, fnam))
                out_nrm = cooler_file(os.path.join(outdir, fnam), resolution,
                                      sections, regions)
                bias_data_row = [1. / b if b > 0 else 0 for b in bias1]
                bias_data_col = [1. / b if b > 0 else 0 for b in bias2]
                out_nrm.write_weights(bias_data_row, bias_data_col,
                                      *bin_coords)
                outfiles.append((os.path.join(outdir, fnam), fnam))
                fnames['NRM'] = os.path.join(outdir, fnam)
        else:
            if 'raw' in normalizations:
                out_raw.close()
                fnames['RAW'] = out_raw.name
            if 'norm' in normalizations:
                out_nrm.close()
                fnames['NRM'] = out_nrm.name
            if 'decay' in normalizations:
                out_dec.close()
                fnames['DEC'] = out_dec.name
            if 'raw&decay' in normalizations:
                out_dec.close()
                fnames['RAW&DEC'] = out_dec.name

    # this is the last thing we do in case something goes wrong
    if clean:
        os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' %
                                              (rand_hash))))

    return fnames
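The write_* factories above compose writer closures: each factory either returns a standalone writer or wraps the previously selected one, so a single pass over the matrix entries can emit every requested normalization at once. A minimal, self-contained sketch of the same chaining pattern (names and outputs here are illustrative, not TADbit's API):

def make_raw_writer(func=None):
    def chained(c, a, b, v):
        func(c, a, b, v)              # call the previously composed writer first
        print('raw', c, a, b, v)      # then emit this writer's own output
    def single(c, a, b, v):
        print('raw', c, a, b, v)
    return chained if func else single

def make_norm_writer(func=None):
    def chained(c, a, b, v):
        func(c, a, b, v)
        print('norm', c, a, b, v / 2.0)
    def single(c, a, b, v):
        print('norm', c, a, b, v / 2.0)
    return chained if func else single

write = None
write = make_raw_writer(write)        # first writer in the chain
write = make_norm_writer(write)       # wraps the raw writer
write('chr1', 0, 1, 10)               # one call emits both the raw and the normalized value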
Ejemplo n.º 38
0
def load_bam(bam_path):
    return AlignmentFile(data_path(bam_path))
Ejemplo n.º 39
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             ncpus=8,
             region1=None,
             start1=None,
             end1=None,
             region2=None,
             start2=None,
             end2=None,
             nchunks=100,
             tmpdir='.',
             verbose=True,
             normalize=False,
             max_size=None,
             chr_order=None):

    bamfile = AlignmentFile(inbam, 'rb')
    bam_refs = bamfile.references
    bam_lengths = bamfile.lengths
    if chr_order:
        bam_refs_idx = [
            bam_refs.index(chr_ord) for chr_ord in chr_order
            if chr_ord in bam_refs
        ]
        if not bam_refs_idx:
            raise Exception('''ERROR: None of the chromosomes in chr_order are
                present in the BAM file. Found %s in BAM file\n''' % (' '.join(bam_refs)))
        bam_refs = [bam_refs[bam_ref_idx] for bam_ref_idx in bam_refs_idx]
        bam_lengths = [bam_lengths[bam_ref_idx] for bam_ref_idx in bam_refs_idx]
    sections = OrderedDict(
        list(zip(bam_refs, [x // resolution + 1 for x in bam_lengths])))
    # get chromosomes and genome sizes
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    # define genomic bins
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in range(len_crm)])
    if not bins:
        raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm))

    # define start, end position of region to grab
    start_bin1 = 0
    end_bin1 = len(bins) + 1
    regions = bam_refs
    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)
    else:
        total = len(bins)
        if start1 is not None or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1 is not None:
        start_bin1 = section_pos[region1][0] + start1 // resolution
    else:
        if region1:
            start_bin1 = section_pos[region1][0]
        else:
            start_bin1 = 0
        start1 = 0
    if end1 is not None:
        end_bin1 = section_pos[region1][0] + end1 // resolution
    else:
        if region1:
            end_bin1 = section_pos[region1][1]
            end1 = sections[region1] * resolution
        else:
            end_bin1 = total
            end1 = total * resolution

    # define chunks, using at most 100 sub-divisions of region1
    total = end_bin1 - start_bin1
    regs = []
    begs = []
    ends = []

    njobs = min(total, nchunks) + 1

    nbins = total // njobs + 1
    for i in range(start_bin1, end_bin1, nbins):
        if i + nbins > end_bin1:  # make sure that we stop at the right place
            nbins = end_bin1 - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            fin1 = sections[crm1]
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            # be sure we don't miss regions in between the start and end bins
            start_chunk = i + fin1 - beg1
            end_chunk = i + nbins - 1 if i + nbins - 1 < len(
                bins) else len(bins) - 1
            (crm1, beg1) = bins[start_chunk]
            fin1 = beg1
            for j in range(start_chunk, end_chunk + 1):
                (crm2, beg2) = bins[j]
                if crm1 == crm2:
                    fin1 = beg2
                    continue
                regs.append(crm1)
                begs.append(beg1 * resolution)
                fin1 = sections[crm1]
                ends.append(
                    fin1 * resolution + resolution -
                    1)  # last nt not included (overlap with next window)
                (crm1, beg1) = (crm2, beg2)
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin1 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # reduce dictionaries
    all_bins = []
    seenbins = set()
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if region1:
            start = start_bin1 - beg_crm
            end = end_bin1 - beg_crm
        else:
            start = 0
            end = section_pos[crm][1] - section_pos[crm][0]
        all_bins.extend([(crm, i) for i in range(start, end)
                         if not (crm, i) in seenbins])
        seenbins = set(all_bins)
    del (seenbins)

    bins_dict1 = dict((j, i) for i, j in enumerate(all_bins))
    if region2:
        if not region2 in section_pos:
            raise Exception('ERROR: chromosome %s not found' % region2)
        bins = []
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 // resolution
        else:
            start_bin2 = section_pos[region2][0]
            start2 = 0
        if end2 is not None:
            end_bin2 = section_pos[region2][0] + end2 // resolution
        else:
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end = end_bin2 - beg_crm
        bins = [(region2, i) for i in range(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        start_bin2 = start_bin1
        end_bin2 = end_bin1
        bins_dict2 = bins_dict1

    size1 = end_bin1 - start_bin1
    size2 = end_bin2 - start_bin2
    if verbose:
        printime('\n  (Matrix size %dx%d)' % (size1, size2))
    if max_size and max_size < size1 * size2:
        raise Exception(('ERROR: matrix too large ({0}x{1}) should be at most '
                         '{2}x{2}').format(size1, size2, int(max_size**0.5)))

    pool = mu.Pool(ncpus)
    # create random hash associated to the run:
    rand_hash = "%016x" % getrandbits(64)

    ## RUN!
    if verbose:
        printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)))
    # empty all_bins array if we are not going to normalize
    if not normalize:
        all_bins = []
    procs = []
    for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
        if ncpus == 1:
            _read_bam_frag(
                inbam,
                filter_exclude,
                all_bins,
                bins_dict1,
                bins_dict2,
                rand_hash,
                resolution,
                tmpdir,
                region,
                b,
                e,
            )
        else:
            procs.append(
                pool.apply_async(_read_bam_frag,
                                 args=(
                                     inbam,
                                     filter_exclude,
                                     all_bins,
                                     bins_dict1,
                                     bins_dict2,
                                     rand_hash,
                                     resolution,
                                     tmpdir,
                                     region,
                                     b,
                                     e,
                                 )))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()
    bin_coords = start_bin1, end_bin1, start_bin2, end_bin2
    chunks = regs, begs, ends
    return regions, rand_hash, bin_coords, chunks
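A minimal call sketch for read_bam as defined above; the BAM path, region and parameter values are hypothetical:

regions, rand_hash, bin_coords, chunks = read_bam(
    'sample_hic.bam',        # hypothetical TADbit-generated BAM
    filter_exclude=0,        # bitwise flag mask; 0 keeps every read
    resolution=100000,       # 100 kb bins
    region1='chr3',          # restrict the matrix to one chromosome
    ncpus=4,
    tmpdir='/tmp')
start_bin1, end_bin1, start_bin2, end_bin2 = bin_coords
regs, begs, ends = chunks    # per-chunk regions and coordinates parsed in parallel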
Ejemplo n.º 40
0
class _BamReaderBase(ReaderBase):
    """
    The BamReader class provides a high-level interface to PacBio BAM
    files.  If a PacBio BAM index (bam.pbi file) is present and the
    user instantiates the BamReader using the reference FASTA as the
    second argument, the BamReader will provide an interface
    compatible with CmpH5Reader.
    """
    def _loadReferenceInfo(self):
        refRecords = self.peer.header["SQ"]
        refNames = [r["SN"] for r in refRecords]
        refLengths = [r["LN"] for r in refRecords]
        refMD5s = [r["M5"] for r in refRecords]
        refIds = list(map(self.peer.get_tid, refNames))  # materialize: used twice below
        nRefs = len(refRecords)

        if nRefs > 0:
            self._referenceInfoTable = np.rec.fromrecords(
                zip(refIds, refIds, refNames, refNames, refLengths, refMD5s,
                    np.zeros(nRefs, dtype=np.uint32),
                    np.zeros(nRefs, dtype=np.uint32)),
                dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'),
                       ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'),
                       ('StartRow', '<u4'), ('EndRow', '<u4')])
            self._referenceDict = {}
            self._referenceDict.update(zip(refIds, self._referenceInfoTable))
            self._referenceDict.update(zip(refNames, self._referenceInfoTable))
        else:
            self._referenceInfoTable = None
            self._referenceDict = None

    def _loadReadGroupInfo(self):
        rgs = self.peer.header["RG"]
        readGroupTable_ = []

        # RGID -> ("abstract feature name" -> actual feature name)
        self._baseFeatureNameMappings = {}
        self._pulseFeatureNameMappings = {}

        for rg in rgs:
            rgID = rgAsInt(rg["ID"])
            rgName = rg["PU"]
            ds = dict([
                pair.split("=") for pair in rg["DS"].split(";") if pair != ""
            ])
            # spec: we only consider first two components of basecaller version
            # in "chem" lookup
            basecallerVersion = ".".join(
                ds["BASECALLERVERSION"].split(".")[0:2])
            triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
            rgChem = decodeTriple(*triple)
            rgReadType = ds["READTYPE"]
            rgFrameRate = ds["FRAMERATEHZ"]

            # Look for the features manifest entries within the DS tag,
            # and build an "indirection layer", i.e. to get from
            # "Ipd"  to "Ipd:Frames"
            # (This is a bit messy.  Can we separate the manifest from
            # the rest of the DS content?)
            baseFeatureNameMapping = {
                key.split(":")[0]: key
                for key in ds.keys() if key in BASE_FEATURE_TAGS
            }
            pulseFeatureNameMapping = {
                key.split(":")[0]: key
                for key in ds.keys() if key in PULSE_FEATURE_TAGS
            }
            self._baseFeatureNameMappings[rgID] = baseFeatureNameMapping
            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping

            readGroupTable_.append(
                (rgID, rgName, rgReadType, rgChem, rgFrameRate,
                 frozenset(baseFeatureNameMapping.keys())))

        self._readGroupTable = np.rec.fromrecords(readGroupTable_,
                                                  dtype=[
                                                      ("ID", np.int32),
                                                      ("MovieName", "O"),
                                                      ("ReadType", "O"),
                                                      ("SequencingChemistry",
                                                       "O"),
                                                      ("FrameRate", float),
                                                      ("BaseFeatures", "O")
                                                  ])
        assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \
            "First 8 chars of read group IDs must be unique!"

        self._readGroupDict = {rg.ID: rg for rg in self._readGroupTable}

        # The base/pulse features "available" to clients of this file are the intersection
        # of features available from each read group.
        self._baseFeaturesAvailable = set.intersection(*[
            set(mapping.keys())
            for mapping in self._baseFeatureNameMappings.values()
        ])
        self._pulseFeaturesAvailable = set.intersection(*[
            set(mapping.keys())
            for mapping in self._pulseFeatureNameMappings.values()
        ])

    def _loadProgramInfo(self):
        pgRecords = [(pg["ID"], pg.get("VN", None), pg.get("CL", None))
                     for pg in self.peer.header.get("PG", [])]

        if len(pgRecords) > 0:
            self._programTable = np.rec.fromrecords(pgRecords,
                                                    dtype=[("ID", "O"),
                                                           ("Version", "O"),
                                                           ("CommandLine", "O")
                                                           ])
        else:
            self._programTable = None

    def _loadReferenceFasta(self, referenceFastaFname):
        ft = FastaTable(referenceFastaFname)
        # Verify that this FASTA is in agreement with the BAM's
        # reference table---BAM should be a subset.
        fastaIdsAndLens = set((c.id, len(c)) for c in ft)
        bamIdsAndLens = set(
            (c.Name, c.Length) for c in self.referenceInfoTable)
        if not bamIdsAndLens.issubset(fastaIdsAndLens):
            raise ReferenceMismatch(
                "FASTA file must contain superset of reference contigs in BAM")
        self.referenceFasta = ft

    def _checkFileCompatibility(self):
        # Verify that this is a "pacbio" BAM file of version at least
        # 3.0.1
        badVersionException = IncompatibleFile(
            "This BAM file is incompatible with this API " +
            "(only PacBio BAM files version >= 3.0.1 are supported)")
        checkedVersion = self.version
        if "b" in checkedVersion:
            raise badVersionException
        else:
            # compare version components numerically, not as strings
            major, minor, patch = (int(x) for x in checkedVersion.split('.'))
            if (major, minor, patch) < (3, 0, 1):
                raise badVersionException

    def __init__(self, fname, referenceFastaFname=None):
        self.filename = fname = abspath(expanduser(fname))
        self.peer = AlignmentFile(fname, "rb", check_sq=False)
        self._checkFileCompatibility()

        self._loadReferenceInfo()
        self._loadReadGroupInfo()
        self._loadProgramInfo()

        self.referenceFasta = None
        if referenceFastaFname is not None:
            if self.isUnmapped:
                raise ValueError(
                    "Unmapped BAM file--reference FASTA should not be given "
                    "as argument to BamReader")
            self._loadReferenceFasta(referenceFastaFname)

    @property
    def isIndexLoaded(self):
        return self.index is not None

    @property
    def isReferenceLoaded(self):
        return self.referenceFasta is not None

    @property
    def isUnmapped(self):
        return not (self.isMapped)

    @property
    def isMapped(self):
        return len(self.peer.header["SQ"]) > 0

    @property
    def alignmentIndex(self):
        raise UnavailableFeature("BAM has no alignment index")

    @property
    def movieNames(self):
        return set([mi.MovieName for mi in self.readGroupTable])

    @property
    def readGroupTable(self):
        return self._readGroupTable

    def readGroupInfo(self, readGroupId):
        return self._readGroupDict[readGroupId]

    @property
    def sequencingChemistry(self):
        """
        List of the sequencing chemistries by movie.  Order is
        unspecified.
        """
        return list(self.readGroupTable.SequencingChemistry)

    @property
    def referenceInfoTable(self):
        return self._referenceInfoTable

    #TODO: standard?  how about subread instead?  why capitalize ccs?
    # can we standardize this?  is cDNA an additional possibility
    @property
    def readType(self):
        """
        Either "standard", "CCS", "mixed", or "unknown", to represent the
        type of PacBio reads aligned in this BAM file.
        """
        readTypes = self.readGroupTable.ReadType
        if all(readTypes == "SUBREAD"):
            return "standard"
        elif all(readTypes == "CCS"):
            return "CCS"
        elif all((readTypes == "CCS") | (readTypes == "SUBREAD")):
            return "mixed"
        else:
            return "unknown"

    @property
    def version(self):
        return self.peer.header["HD"]["pb"]

    def versionAtLeast(self, minimalVersion):
        raise Unimplemented()

    def softwareVersion(self, programName):
        raise Unimplemented()

    @property
    def isSorted(self):
        return self.peer.header["HD"]["SO"] == "coordinate"

    @property
    def isBarcoded(self):
        raise Unimplemented()

    @property
    def isEmpty(self):
        return (len(self) == 0)

    def referenceInfo(self, key):
        return self._referenceDict[key]

    def atOffset(self, offset):
        self.peer.seek(offset)
        return BamAlignment(self, next(self.peer))

    def hasBaseFeature(self, featureName):
        return featureName in self._baseFeaturesAvailable

    def baseFeaturesAvailable(self):
        return self._baseFeaturesAvailable

    def hasPulseFeature(self, featureName):
        return featureName in self._pulseFeaturesAvailable

    def pulseFeaturesAvailable(self):
        return self._pulseFeaturesAvailable

    def hasPulseFeatures(self):
        """
        Is this BAM file a product of running analysis with the
        PacBio-internal analysis mode enabled?
        """
        return self.hasPulseFeature("PulseCall")

    @property
    def barcode(self):
        raise Unimplemented()

    @property
    def barcodeName(self):
        raise Unimplemented()

    @property
    def barcodes(self):
        raise Unimplemented()

    @requiresBai
    def __len__(self):
        return self.peer.mapped + self.peer.unmapped

    def close(self):
        # the underlying pysam AlignmentFile is stored as self.peer (see __init__)
        if hasattr(self, "peer") and self.peer is not None:
            self.peer.close()
            self.peer = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
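A short usage sketch for the reader interface defined by this base class, assuming a concrete subclass such as BamReader (the file name is hypothetical):

with BamReader('aligned_subreads.bam') as reader:   # hypothetical PacBio BAM
    print(reader.readType)        # "standard", "CCS", "mixed", or "unknown"
    print(reader.isSorted)        # True if the header says SO:coordinate
    for ref in reader.referenceInfoTable:
        print(ref.Name, ref.Length)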
Ejemplo n.º 41
0
def _read_bam_frag(inbam,
                   filter_exclude,
                   all_bins,
                   sections1,
                   sections2,
                   rand_hash,
                   resolution,
                   tmpdir,
                   region,
                   start,
                   end,
                   half=False,
                   sum_columns=False):
    bamfile = AlignmentFile(inbam, 'rb')
    refs = bamfile.references
    bam_start = start - 2
    bam_start = max(0, bam_start)
    try:
        dico = {}
        for r in bamfile.fetch(
                region=region,
                start=bam_start,
                end=end,  # coords starts at 0
                multiple_iterators=True):
            if r.flag & filter_exclude:
                continue
            crm1 = r.reference_name
            pos1 = r.reference_start + 1
            crm2 = refs[r.next_reference_id]   # mate chromosome
            pos2 = r.next_reference_start + 1  # mate position (1-based)
            try:
                pos1 = sections1[(crm1, pos1 // resolution)]
                pos2 = sections2[(crm2, pos2 // resolution)]
            except KeyError:
                continue  # not in the subset matrix we want
            crm = crm1 * (crm1 == crm2)  # chromosome name for cis contacts, '' for trans
            try:
                dico[(crm, pos1, pos2)] += 1
            except KeyError:
                dico[(crm, pos1, pos2)] = 1
            # print '%-50s %5s %9s %5s %9s' % (r.query_name,
            #                                  crm1, r.reference_start + 1,
            #                                  crm2, r.mpos + 1)
        if half:
            # iterate over a copy of the keys: deleting from the dict while
            # iterating over it directly raises a RuntimeError
            for c, i, j in list(dico):
                if i < j:
                    del dico[(c, i, j)]
        out = open(
            os.path.join(tmpdir, '_tmp_%s' % (rand_hash),
                         '%s:%d-%d.tsv' % (region, start, end)), 'w')
        out.write(''.join('%s\t%d\t%d\t%d\n' % (c, a, b, v)
                          for (c, a, b), v in dico.items()))
        out.close()
        if sum_columns:
            sumcol = {}
            cisprc = {}
            for (c, i, j), v in dico.items():
                # out.write('%d\t%d\t%d\n' % (i, j, v))
                try:
                    sumcol[i] += v
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
                except KeyError:
                    sumcol[i] = v
                    cisprc[i] = [0, 0]
                    cisprc[i][all_bins[i][0] == all_bins[j][0]] += v
            return sumcol, cisprc
    except Exception as e:
        exc_type, exc_obj, exc_tb = exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(e)
        print(exc_type, fname, exc_tb.tb_lineno)
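_read_bam_frag dumps one tab-separated file per chunk under tmpdir/_tmp_<rand_hash>/, with columns chromosome, bin1, bin2 and count. A small sketch (not part of the original code) for merging those chunk files back into a single dictionary:

import glob
import os
from collections import defaultdict

def merge_chunk_counts(tmpdir, rand_hash):
    # aggregate the per-chunk TSVs written by _read_bam_frag
    counts = defaultdict(int)
    for fname in glob.glob(os.path.join(tmpdir, '_tmp_%s' % rand_hash, '*.tsv')):
        with open(fname) as handle:
            for line in handle:
                crm, a, b, v = line.rstrip('\n').split('\t')
                counts[(crm, int(a), int(b))] += int(v)
    return counts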
Ejemplo n.º 42
0
def load_hic_data_from_bam(fnam,
                           resolution,
                           biases=None,
                           tmpdir='.',
                           ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None,
                           verbose=True,
                           clean=True):
    """
    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to pickle file where are stored the biases. Keys
       in this file should be: 'biases', 'badcol', 'decay' and 'resolution'
    :param '.' tmpdir: path to folder where to create temporary files
    :param 8 ncpus:
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter exclude: filters to define the
       set of valid pair of reads.
    :param None region: chromosome name, if None, all genome will be loaded

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    genome_seq = OrderedDict((c, l) for c, l in zip(
        bam.references, [x // resolution + 1 for x in bam.lengths]))
    bam.close()

    sections = []
    for crm in genome_seq:
        len_crm = genome_seq[crm]
        sections.extend([(crm, i) for i in range(len_crm)])

    size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((),
                   size,
                   chromosomes=chromosomes,
                   dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, str):
            biases = load(open(biases, 'rb'))  # pickled dict of biases
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of the biases (%d) does not '
                            'match the one requested (%d)' %
                            (biases['resolution'], resolution))
        if region:
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                len_crm = genome_seq[crm]
                chrom_start += len_crm
            imx.bads = dict((b - chrom_start, biases['badcol'][b])
                            for b in biases['badcol'])
            imx.bias = dict((b - chrom_start, biases['biases'][b])
                            for b in biases['biases'])
        else:
            imx.bads = biases['badcol']
            imx.bias = biases['biases']
        imx.expected = biases['decay']

    get_matrix(fnam,
               resolution,
               biases=None,
               filter_exclude=filter_exclude,
               normalization='raw',
               tmpdir=tmpdir,
               clean=clean,
               ncpus=ncpus,
               dico=imx,
               region1=region,
               verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx
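A usage sketch based on the docstring above; the file names are hypothetical:

hic_data = load_hic_data_from_bam(
    'sample_hic.bam',               # TADbit-generated BAM (hypothetical path)
    resolution=100000,              # 100 kb bins
    biases='biases_100kb.pickle',   # pickle holding 'biases', 'badcol', 'decay', 'resolution'
    region='chr3',                  # load a single chromosome
    ncpus=4)                        # returns the HiC_data object described above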
Ejemplo n.º 43
0
    def test_downgrade_read_edges(self):
        # With softclip
        bam_fpath = os.path.join(TEST_DATA_DIR, 'sample.bam')
        sam = AlignmentFile(bam_fpath)

        aligned_read = next(sam)
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        res = [
            9, 9, 9, 9, 9, 9, 3, 9, 8, 8, 9, 9, 9, 9, 9, 39, 39, 39, 38, 38,
            36, 33, 36, 38, 36, 38, 38, 38, 38, 39, 39, 38, 38, 38, 9, 9, 9, 9
        ]
        assert list(aligned_read.query_qualities) == res

        # without softclip
        sam = AlignmentFile(os.path.join(TEST_DATA_DIR, 'seqs.bam'))

        aligned_read = next(sam)
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        expected = [
            11, 13, 11, 11, 37, 43, 43, 46, 46, 57, 57, 48, 57, 57, 42, 41, 32,
            35, 38, 38, 38, 38, 41, 41, 39, 37, 37, 44, 42, 48, 47, 57, 47, 47,
            48, 47, 57, 57, 54, 48, 57, 48, 54, 50, 50, 50, 50, 50, 57, 59, 54,
            54, 54, 57, 57, 59, 57, 52, 52, 52, 52, 57, 57, 57, 57, 52, 52, 52,
            52, 29, 27, 27, 22
        ]

        assert list(aligned_read.query_qualities) == expected

        # reverse
        # rev seqs (the SAM specification stores the alignment query in forward
        # orientation (cigar, seq, qual, ...); reverse is only noted in the flag)
        bam_fpath = os.path.join(TEST_DATA_DIR, 'sample_rev.bam')
        sam = AlignmentFile(bam_fpath)
        aligned_read = next(sam)
        aligned_read = next(sam)
        aligned_read = next(sam)   # use the third read of the file
        original_qual = aligned_read.query_qualities
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        res = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0]
        assert list(aligned_read.query_qualities[:14]) == res

        # check that we can restore the quals from the tag
        _restore_qual_from_tag(aligned_read)
        assert original_qual == aligned_read.query_qualities

        # only restore left quals
        aligned_read = next(sam)
        original_qual = aligned_read.query_qualities
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        aligned_read.set_tag(RIGTH_DOWNGRADED_TAG, None)
        changed_rquals = aligned_read.query_qualities[-5:]
        _restore_qual_from_tag(aligned_read)
        assert aligned_read.query_qualities[-5:] == changed_rquals
        assert aligned_read.query_qualities[:10] == original_qual[:10]

        # only restore right quals
        sam = AlignmentFile(os.path.join(TEST_DATA_DIR, 'seqs.bam'))
        aligned_read = next(sam)
        original_qual = aligned_read.query_qualities
        _downgrade_edge_qualities(aligned_read, size=4, qual_to_substract=30)
        aligned_read.set_tag(LEFT_DOWNGRADED_TAG, None)
        changed_lquals = aligned_read.query_qualities[:5]
        _restore_qual_from_tag(aligned_read)
        assert aligned_read.query_qualities[:5] == changed_lquals
        assert aligned_read.query_qualities[10:] == original_qual[10:]
Ejemplo n.º 44
0
def __init__(self, output, indexed_sequence_list, index_options):
    header = self.build_header(indexed_sequence_list, index_options)
    self.writer = AlignmentFile(output, 'wb', header=header)
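The constructor above builds a BAM writer from a header produced by self.build_header. A self-contained sketch of the underlying pysam pattern (the header content and the read are made up for illustration):

import pysam

# Sketch: build a header dict and write one aligned read (values are illustrative).
header = {'HD': {'VN': '1.6', 'SO': 'unsorted'},
          'SQ': [{'SN': 'chr1', 'LN': 248956422}]}
with pysam.AlignmentFile('out.bam', 'wb', header=header) as writer:
    read = pysam.AlignedSegment()
    read.query_name = 'read_001'
    read.query_sequence = 'ACGTACGT'
    read.query_qualities = pysam.qualitystring_to_array('IIIIIIII')
    read.reference_id = 0          # index into the SQ list above
    read.reference_start = 100
    read.cigartuples = [(0, 8)]    # 8M
    read.flag = 0
    writer.write(read)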
Ejemplo n.º 45
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes to the expected number of bins
        for c in refs:
            n_bins = len(genome[c]) // opts.reso + 1  # bins expected for this chromosome
            if not c in mappability:
                mappability[c] = [float('nan')] * n_bins
            if len(mappability[c]) < n_bins:
                mappability[c] += [float('nan')] * (n_bins - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')  # pickle is written in binary mode

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
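The pickle written by this step holds a plain dictionary with the keys 'biases', 'decay', 'badcol' and 'resolution'. A short sketch for loading it back (the file name is hypothetical):

import pickle

with open('biases_100kb_abc123.pickle', 'rb') as handle:   # hypothetical file name
    data = pickle.load(handle)
biases = data['biases']       # per-bin normalization factors
badcol = data['badcol']       # columns filtered out as unreliable
decay = data['decay']         # expected interaction counts by genomic distance
resolution = data['resolution']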
Ejemplo n.º 46
0
def phase_structural_variants(sv_vcf, long_reads_bam, workdir):
    sv_vcf_basename = os.path.basename(sv_vcf)
    if sv_vcf_basename.endswith('.vcf'):
        offset = -4
    elif sv_vcf_basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return

    sv_filtered_phased_vcf = workdir + '/' + sv_vcf_basename[:offset] + '.filtered.phased.vcf'
    vcf_in = VariantFile(sv_vcf)
    vcf_out = VariantFile(sv_filtered_phased_vcf, 'w', header=vcf_in.header)
    bam_in = AlignmentFile(long_reads_bam)
    phasing_stat_f = open(workdir + '/' + 'phasing_stat.txt', 'w')

    
    chr_to_include = ['1',
                      '2',
                      '3',
                      '4',
                      '5',
                      '6',
                      '7',
                      '8',
                      '9',
                      '10',
                      '11',
                      '12',
                      '13',
                      '14',
                      '15',
                      '16',
                      '17',
                      '18',
                      '19',
                      '20',
                      '21',
                      '22',
                      'X',
                      'Y']
    
    """
    chr_to_include = ['chr1',
                      'chr2',
                      'chr3',
                      'chr4',
                      'chr5',
                      'chr6',
                      'chr7',
                      'chr8',
                      'chr9',
                      'chr10',
                      'chr11',
                      'chr12',
                      'chr13',
                      'chr14',
                      'chr15',
                      'chr16',
                      'chr17',
                      'chr18',
                      'chr19',
                      'chr20',
                      'chr21',
                      'chr22',
                      'chrX',
                      'chrY']
    """

    phasing_stat = {'INS' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DEL' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'INV' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'BND' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DUP:TANDEM' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DUP_INT' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}}

    prev_chrom = ''
    for rec in vcf_in.fetch():
        sv_chrom = rec.chrom
        if sv_chrom in chr_to_include:
            if sv_chrom != prev_chrom:
                logging.info('Processing {0}'.format(sv_chrom))
            prev_chrom = sv_chrom
            if rec.filter.keys()[0] == 'PASS':
                sv_pos = rec.pos
                sv_read_ids = rec.info['READS']
                sv_support = rec.info['SUPPORT']
                sv_type = rec.info['SVTYPE']

                phasing_stat[sv_type]['Total'] += 1

                begin_pos = sv_pos - 1
                if 'END' in rec.info:
                    end_pos = rec.info['END']
                else:
                    end_pos = sv_pos

                hap1_counter = 0
                hap2_counter = 0
                try:
                    read_iterator = bam_in.fetch(sv_chrom, begin_pos-2000, end_pos+2000)
                except ValueError:
                    read_iterator = bam_in.fetch(sv_chrom, begin_pos, end_pos)
                for read in read_iterator:
                    if read.query_name in sv_read_ids:
                        if read.has_tag('HP'):
                            read_hp = read.get_tag('HP')
                            hap1_counter += read_hp == 1
                            hap2_counter += read_hp == 2

                threshold_read_count = max(int(0.85 * sv_support), 5)
                threshold_het = 0.8
                threshold_hom = 0.2

                if (hap1_counter + hap2_counter) >= threshold_read_count:
                    allele_frequency_hap1 = hap1_counter / float(hap1_counter + hap2_counter)
                    allele_frequency_hap2 = hap2_counter / float(hap1_counter + hap2_counter)

                    if allele_frequency_hap1 >= threshold_hom and allele_frequency_hap1 < threshold_het:
                        rec.samples[0]['GT'] = (1, 1)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HOM'] += 1
                    elif allele_frequency_hap1 >= threshold_het:
                        rec.samples[0]['GT'] = (1, 0)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HET'] += 1
                    elif allele_frequency_hap2 >= threshold_het:
                        rec.samples[0]['GT'] = (0, 1)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HET'] += 1

                    vcf_out.write(rec)
    
    phasing_stat_f.write('\tTotal\tPhased HOM\tPhased HET\n')
    for sv in phasing_stat:
        phasing_stat_f.write('{0}:\t{1}\t{2}\t{3}\n'.format(sv, phasing_stat[sv]['Total'], phasing_stat[sv]['Phased HOM'], phasing_stat[sv]['Phased HET']))
    phasing_stat_f.close()
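The genotype calls above depend only on how the supporting, haplotagged reads split between the two haplotypes: a roughly balanced split is called homozygous, a strongly one-sided split heterozygous on that haplotype. A stand-alone sketch of the decision rule, with the same thresholds as the function above:

# Sketch of the genotype decision rule used above (thresholds as in the function).
def assign_genotype(hap1_count, hap2_count, support,
                    threshold_het=0.8, threshold_hom=0.2):
    threshold_read_count = max(int(0.85 * support), 5)
    total = hap1_count + hap2_count
    if total < threshold_read_count:
        return None                       # not enough haplotagged reads to phase
    af1 = hap1_count / float(total)
    af2 = hap2_count / float(total)
    if threshold_hom <= af1 < threshold_het:
        return (1, 1)                     # balanced support: homozygous
    if af1 >= threshold_het:
        return (1, 0)                     # mostly haplotype 1: heterozygous
    if af2 >= threshold_het:
        return (0, 1)                     # mostly haplotype 2: heterozygous
    return None

print(assign_genotype(9, 11, 20))         # (1, 1): 45% / 55% split
print(assign_genotype(18, 2, 20))         # (1, 0): 90% on haplotype 1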
Ejemplo n.º 47
0
    args = parser.parse_args()

    # Logger
    logging.basicConfig(
        format=
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    if args.output_reads_2:  # Write all reads in a pair of files (R1 and R2)
        with FastqIO(args.output_reads_2, "w") as writer_r2:
            with FastqIO(args.output_reads, "w") as writer_r1:
                with AlignmentFile(args.input_aln, "rb",
                                   check_sq=False) as reader:
                    for curr_read in reader.fetch(until_eof=True):
                        if not curr_read.is_secondary and not curr_read.is_supplementary:
                            if args.keep_qc_failed or not curr_read.is_qcfail:
                                barcode = args.reads_barcode
                                if barcode is None and curr_read.has_tag(
                                        args.barcode_tag):
                                    barcode = curr_read.get_tag(
                                        args.barcode_tag).replace("-", "+")
                                description = "{}:{}:0:{} {}={}".format(
                                    "1" if curr_read.is_read1 else "2",
                                    "Y" if curr_read.is_qcfail else "N",
                                    "" if barcode is None else barcode,
                                    args.umi_qual_tag,
                                    curr_read.get_tag(args.umi_qual_tag))
                                read = Sequence(
Ejemplo n.º 48
0
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 relative=None,
                 anonymize=False,
                 use_barcodes=False):
    if output_bam is None:
        log.error("Must specify output file")
        return 1
    output_bam = op.abspath(output_bam)
    if not op.isdir(op.dirname(output_bam)):
        log.error("Output path '{d}' does not exist.".format(
            d=op.dirname(output_bam)))
        return 1
    n_specified = 4 - [whitelist, blacklist, percentage, count].count(None)
    if n_specified != 1:
        log.error("You must choose one and only one of the following " +
                  "options: --whitelist, --blacklist, --count, --percentage")
        return 1
    if seed is not None:
        random.seed(seed)
    if whitelist is None and blacklist is None:
        # exactly one of percentage/count is set at this point
        if (percentage is not None and not 0 < percentage < 100) or \
                (count is not None and count <= 0):
            log.error("No reads selected for output.")
            return 1
    output_ds = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            print "DataSet output only supported for DataSet inputs."
            return 1
        ds_type = output_bam.split(".")[-2]
        ext2 = {
            "subreadset": "subreads",
            "alignmentset": "subreads",
            "consensusreadset": "ccs",
            "consensusalignmentset": "ccs"
        }
        if not ds_type in ext2:
            raise ValueError("Invalid dataset type 't'".format(t=ds_type))
        output_ds = output_bam
        output_bam = ".".join(
            output_ds.split(".")[:-2] + [ext2[ds_type], "bam"])
    if output_bam == input_bam:
        log.error("Input and output files must not be the same path")
        return 1
    elif not output_bam.endswith(".bam"):
        log.error("Output file name must end in either '.bam' or '.xml'")
        return 1
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise TypeError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            log.error("Input BAM must have accompanying .pbi index")
            return 1
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                barcode_set = ext_res.barcodes
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None:
            whitelist = _create_whitelist(ds_in, percentage, count)
        # convert these to Python sets
        _whitelist = _process_zmw_list(whitelist)
        _blacklist = _process_zmw_list(blacklist)
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warn("Scraps BAM is present but lacks " +
                                 "barcodes - will not be propagated " +
                                 "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in,
                    bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in_ = IndexedBamReader(ext_res.scraps)
                        n_records, have_zmws_ = _process_bam_whitelist(
                            scraps_in_,
                            scraps_out,
                            _whitelist,
                            _blacklist,
                            use_barcodes=use_barcodes,
                            anonymize=anonymize)
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        log.error("No reads written")
        return 1
    log.info("{n} records from {z} ZMWs written".format(n=n_file_reads,
                                                        z=len(have_zmws)))

    def _run_pbindex(bam_file):
        try:
            rc = subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:
                log.warn("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(t=ds_out.__class__.__name__,
                                                   x=output_ds))
    return 0
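A hypothetical call sketch for filter_reads as defined above; the file names are made up, and the whitelist format is whatever _process_zmw_list accepts:

status = filter_reads(
    input_bam='movie.subreadset.xml',     # indexed input dataset (hypothetical)
    output_bam='subset.subreadset.xml',   # .xml output triggers dataset creation
    whitelist='zmws_to_keep.txt',         # ZMW whitelist (parsed by _process_zmw_list)
    seed=42)
assert status == 0                        # the function returns 0 on success, 1 on error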
Ejemplo n.º 49
0
class PaddedSAM(object):
    """
    Obtain aligned (padded) queries from a SAM/BAM file.

    @param filename: The C{str} name of the SAM/BAM file.
    """
    def __init__(self, filename):
        self.samfile = AlignmentFile(filename)
        # self.referenceInsertions will be keyed by offset into the reference
        # sequence. The inserted bases would need to begin at this offset. The
        # value will be a Counter whose keys are the nucleotides proposed for
        # insertion, with a value indicating how many times the nucleotide was
        # proposed for insertion at that offset.
        self.referenceInsertions = defaultdict(Counter)

    def close(self):
        """
        Close the opened SAM/BAM file.
        """
        self.samfile.close()

    def referencesToStr(self, indent=0):
        """
        List the reference names and their lengths.

        @param indent: An C{int} number of spaces to indent each line.
        @return: A C{str} describing known reference names and their lengths.
        """
        samfile = self.samfile
        result = []
        indent = ' ' * indent
        for i in range(samfile.nreferences):
            result.append('%s%s (length %d)' % (
                indent, samfile.get_reference_name(i), samfile.lengths[i]))
        return '\n'.join(result)

    def queries(self, referenceName=None, minLength=0, rcSuffix='',
                dropSecondary=False, dropSupplementary=False,
                dropDuplicates=False, allowDuplicateIds=False,
                keepQCFailures=False, rcNeeded=False, padChar='-',
                queryInsertionChar='N'):
        """
        Produce padded (with gaps) queries according to the CIGAR string and
        reference sequence length for each matching query sequence.

        @param referenceName: The C{str} name of the reference sequence to
            print alignments for. This is only needed if the SAM/BAM alignment
            was made against multiple references *and* they are not all of the
            same length. If there is only one reference sequence or if all
            reference sequences are of the same length, there is no need to
            provide a reference name (i.e., pass C{None}).
        @param minLength: Ignore queries shorter than this C{int} value. Note
            that this refers to the length of the query sequence once it has
            been aligned to the reference. The alignment may introduce
            C{queryInsertionChar} characters into the read, and these are
            counted towards its length because the alignment is assuming the
            query is missing a base at those locations.
        @param rcSuffix: A C{str} to add to the end of query names that are
            reverse complemented. This is added before the /1, /2, etc., that
            are added for duplicated ids (if there are duplicates and
            C{allowDuplicateIds} is C{False}).
        @param dropSecondary: If C{True}, secondary matches will not be
            yielded.
        @param dropSupplementary: If C{True}, supplementary matches will not be
            yielded.
        @param dropDuplicates: If C{True}, matches flagged as optical or PCR
            duplicates will not be yielded.
        @param allowDuplicateIds: If C{True}, repeated query ids (due to
            secondary or supplemental matches) will not have /1, /2, etc.
            appended to their ids. So repeated ids may appear in the yielded
            FASTA.
        @param keepQCFailures: If C{True}, reads that are marked as quality
            control failures will be included in the output.
        @param rcNeeded: If C{True}, queries that are flagged as matching when
            reverse complemented should have reverse complementing when
            preparing the output sequences. This must be used if the program
            that created the SAM/BAM input flags reversed matches but does not
            also store the reverse complemented query.
        @param padChar: A C{str} of length one to use to pad queries with to
            make them the same length as the reference sequence.
        @param queryInsertionChar:  A C{str} of length one to use to insert
            into queries when the CIGAR string indicates that the alignment
            of a query would cause a deletion in the reference. This character
            is inserted as a 'missing' query character (i.e., a base that can
            be assumed to have been lost due to an error) whose existence is
            necessary for the match to continue.
        @raises UnequalReferenceLengthError: If C{referenceName} is C{None}
            and the reference sequence lengths in the SAM/BAM file are not all
            identical.
        @raises UnknownReference: If C{referenceName} does not exist.
        @return: A generator that yields C{Read} instances that are padded
            with gap characters to align them to the length of the reference
            sequence.
        """
        samfile = self.samfile

        if referenceName:
            referenceId = samfile.get_tid(referenceName)
            if referenceId == -1:
                raise UnknownReference(
                    'Reference %r is not present in the SAM/BAM file.'
                    % referenceName)
            referenceLength = samfile.lengths[referenceId]
        else:
            # No reference given. All references must have the same length.
            if len(set(samfile.lengths)) != 1:
                raise UnequalReferenceLengthError(
                    'Your SAM/BAM file has %d reference sequences, and their '
                    'lengths (%s) are not all identical.' % (
                        samfile.nreferences,
                        ', '.join(map(str, sorted(samfile.lengths)))))
            referenceId = None
            referenceLength = samfile.lengths[0]

        # Hold the count for each id so we can add /1, /2 etc to duplicate
        # ids (unless --allowDuplicateIds was given).
        idCount = Counter()

        MATCH_OPERATIONS = {CMATCH, CEQUAL, CDIFF}

        for read in samfile.fetch():
            query = read.query_sequence
            if (read.is_unmapped or
                    (read.is_secondary and dropSecondary) or
                    (read.is_supplementary and dropSupplementary) or
                    (read.is_duplicate and dropDuplicates) or
                    (read.is_qcfail and not keepQCFailures) or
                    (referenceId is not None and
                     read.reference_id != referenceId)):
                continue

            if read.is_reverse:
                if rcNeeded:
                    query = DNARead('id', query).reverseComplement().sequence
                if rcSuffix:
                    read.query_name += rcSuffix

            referenceStart = read.reference_start
            atStart = True
            queryIndex = 0
            referenceIndex = referenceStart
            alignedSequence = ''

            for operation, length in read.cigartuples:

                # The operations are tested in the order they appear in
                # https://samtools.github.io/hts-specs/SAMv1.pdf It would be
                # more efficient to test them in order of frequency of
                # occurrence.
                if operation in MATCH_OPERATIONS:
                    atStart = False
                    alignedSequence += query[queryIndex:queryIndex + length]
                elif operation == CINS:
                    # Insertion to the reference. This consumes query bases but
                    # we don't output them because the reference cannot be
                    # changed.  I.e., these bases in the query would need to be
                    # inserted into the reference.  Remove these bases from the
                    # query but record what would have been inserted into the
                    # reference.
                    atStart = False
                    for i in range(length):
                        self.referenceInsertions[referenceIndex + i][
                            query[queryIndex + i]] += 1
                elif operation == CDEL:
                    # Delete from the reference. Some bases from the reference
                    # would need to be deleted to continue the match. So we put
                    # an insertion into the query to compensate.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                elif operation == CREF_SKIP:
                    # Skipped reference. Opens a gap in the query. For
                    # mRNA-to-genome alignment, an N operation represents an
                    # intron.  For other types of alignments, the
                    # interpretation of N is not defined. So this is unlikely
                    # to occur.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                elif operation == CSOFT_CLIP:
                    # Bases in the query that are not part of the match. We
                    # remove these from the query if they protrude before the
                    # start or after the end of the reference. According to the
                    # SAM docs, 'S' operations may only have 'H' operations
                    # between them and the ends of the CIGAR string.
                    if atStart:
                        # Don't set atStart=False, in case there's another 'S'
                        # operation.
                        unwantedLeft = length - referenceStart
                        if unwantedLeft > 0:
                            # The query protrudes left. Copy its right part.
                            alignedSequence += query[queryIndex + unwantedLeft:
                                                     queryIndex + length]
                            referenceStart = 0
                        else:
                            referenceStart -= length
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                    else:
                        unwantedRight = (
                            (referenceStart + len(alignedSequence) + length) -
                            referenceLength)

                        if unwantedRight > 0:
                            # The query protrudes right. Copy its left part.
                            alignedSequence += query[
                                queryIndex:queryIndex + length - unwantedRight]
                        else:
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                elif operation == CHARD_CLIP:
                    # Some bases have been completely removed from the query.
                    # This (H) can only be present as the first and/or last
                    # operation. There is nothing to do as the bases are simply
                    # not present in the query string in the SAM/BAM file.
                    pass
                elif operation == CPAD:
                    # This is "silent deletion from the padded reference",
                    # which consumes neither query nor reference.
                    atStart = False
                else:
                    raise ValueError('Unknown CIGAR operation: %r' % operation)

                if operation in _CONSUMES_QUERY:
                    queryIndex += length

                if operation in _CONSUMES_REFERENCE:
                    referenceIndex += length

            # Sanity check that we consumed the entire query.
            assert queryIndex == len(query)

            # We cannot test that we consumed the entire reference. The CIGAR
            # string applies to (i.e., exhausts) the query and is silent about
            # the part of the reference that lies to the right of the aligned
            # query.

            # Check the length restriction now that we have (possibly) added
            # queryInsertionChar characters to pad the query out to the length
            # it requires to match the reference.
            if len(alignedSequence) < minLength:
                continue

            # Put gap characters before and after the aligned sequence so that
            # it is offset properly and matches the length of the reference.
            paddedSequence = (
                (padChar * referenceStart) +
                alignedSequence +
                padChar * (referenceLength -
                           (referenceStart + len(alignedSequence))))

            if allowDuplicateIds:
                suffix = ''
            else:
                count = idCount[read.query_name]
                idCount[read.query_name] += 1
                suffix = '' if count == 0 else '/%d' % count

            yield Read('%s%s' % (read.query_name, suffix), paddedSequence)
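The _CONSUMES_QUERY and _CONSUMES_REFERENCE sets referenced above are defined elsewhere in the module and are not part of this snippet. A minimal sketch of how they could be built from pysam's CIGAR constants, following the "consumes query" / "consumes reference" columns of the SAM specification:

# Sketch (not from the original snippet): CIGAR operations that consume query
# and/or reference bases, per the SAM specification.
from pysam import CMATCH, CINS, CDEL, CREF_SKIP, CSOFT_CLIP, CEQUAL, CDIFF

# M, I, S, = and X consume the query.
_CONSUMES_QUERY = {CMATCH, CINS, CSOFT_CLIP, CEQUAL, CDIFF}

# M, D, N, = and X consume the reference.
_CONSUMES_REFERENCE = {CMATCH, CDEL, CREF_SKIP, CEQUAL, CDIFF}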
Ejemplo n.º 50
0
Archivo: umis.py Proyecto: vals/umis
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse,
             parse_tags, gene_tags):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map; it needs to be TSV.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t")
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample))
        start_sampling  = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    targets = [x["SN"] for x in sam_file.header["SQ"]]
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    missing_transcripts = set()
    for i, aln in enumerate(track):
        if count and not count % 1000000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info("%d were filtered for not matching known barcodes."
                            % nomatchcb)
        count += 1

        if aln.is_unmapped:
            unmapped += 1
            continue

        if gene_tags and not aln.has_tag('GX'):
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        if parse_tags:
            CB = aln.get_tag('CR')
        else:
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        if parse_tags:
            MB = aln.get_tag('UM')
        else:
            MB = match.group('MB')

        if gene_tags:
            target_name = aln.get_tag('GX').split(',')[0]
        else:
            txid = sam_file.getrname(aln.reference_id)
            if gene_map:
                if txid in gene_map:
                    target_name = gene_map[txid]
                else:
                    missing_transcripts.add(txid)
                    target_name = txid
            else:
                target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)

        kept += 1

    tally_time = time.time() - start_tally
    if missing_transcripts:
        logger.warn('The following transcripts were missing gene_ids, so we added them as the transcript ids: %s' % str(missing_transcripts))
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    logger.info('Writing evidence')
    with tempfile.NamedTemporaryFile('w+t') as out_handle:
        for key in evidence:
            line = '{},{}\n'.format(key, evidence[key])
            out_handle.write(line)

        out_handle.flush()
        out_handle.seek(0)

        evidence_table = pd.read_csv(out_handle, header=None)

    del evidence

    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns=['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    elif gene_tags:
        expanded = expanded.sort_index()
        genes = expanded

    else:
        # make data frame have a complete accounting of transcripts
        targets = pd.Series(index=set(targets))
        targets = targets.sort_index()
        expanded = expanded.reindex(targets.index.values, fill_value=0)
        genes = expanded

    genes.fillna(0, inplace=True)
    genes = genes.astype(int)
    genes.index.name = "gene"

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t')

    if output_evidence_table:
        # The temporary evidence file has already been closed and removed at
        # this point, so write out the table that was read back from it.
        evidence_table.to_csv(output_evidence_table, header=False, index=False)

    if sparse:
        pd.Series(genes.index).to_csv(out + ".rownames", index=False, header=False)
        pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False, header=False)
        with open(out, "w+b") as out_handle:
            scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes))
    else:
        genes.to_csv(out)
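The weigh_evidence helper imported from utils is not shown on this page. Judging from the 'Scale evidence by number of hits' comment, a plausible sketch is that it down-weights multi-mapping reads by the NH tag; the project's actual implementation may differ:

# Hypothetical sketch of utils.weigh_evidence: give each alignment a weight
# of 1 / NH, where NH is the reported number of hits for the read.
def weigh_evidence(tags):
    nh = dict(tags).get("NH", 1)
    return 1.0 / nh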
Ejemplo n.º 51
0
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence,
             cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse,
             parse_tags, gene_tags):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map; it needs to be TSV.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram,
                                index_col=0,
                                header=-1,
                                squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(
            cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info(
            'Creating reservoir of subsampled reads ({} per cell)'.format(
                subsample))
        start_sampling = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(int)

    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    targets = [x["SN"] for x in sam_file.header["SQ"]]
    track = sam_file.fetch(until_eof=True)
    count = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    count_this_read = True
    missing_transcripts = set()
    for i, aln in enumerate(track):
        if count and not count % 1000000:
            logger.info("Processed %d alignments, kept %d." % (count, kept))
            logger.info("%d were filtered for being unmapped." % unmapped)
            if filter_cb:
                logger.info(
                    "%d were filtered for not matching known barcodes." %
                    nomatchcb)
        count += 1

        if aln.is_unmapped:
            unmapped += 1
            continue

        if gene_tags and not aln.has_tag('GX'):
            unmapped += 1
            continue

        if aln.qname != current_read:
            current_read = aln.qname
            if subsample and i not in index_filter:
                count_this_read = False
                continue
            else:
                count_this_read = True
        else:
            if not count_this_read:
                continue

        if parse_tags:
            CB = aln.get_tag('CR')
        else:
            match = parser_re.match(aln.qname)
            CB = match.group('CB')

        if filter_cb:
            if CB not in cb_hist.index:
                nomatchcb += 1
                continue

        if parse_tags:
            MB = aln.get_tag('UM')
        else:
            MB = match.group('MB')

        if gene_tags:
            target_name = aln.get_tag('GX').split(',')[0]
        else:
            txid = sam_file.getrname(aln.reference_id)
            if gene_map:
                if txid in gene_map:
                    target_name = gene_map[txid]
                else:
                    missing_transcripts.add(txid)
                    target_name = txid
            else:
                target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        if no_scale_evidence:
            evidence[e_tuple] += 1.0
        else:
            evidence[e_tuple] += weigh_evidence(aln.tags)

        kept += 1

    tally_time = time.time() - start_tally
    if missing_transcripts:
        logger.warn(
            'The following transcripts were missing gene_ids, so we added them as the transcript ids: %s'
            % str(missing_transcripts))
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(
        tally_time, int(60. * count / tally_time)))
    logger.info('Collapsing evidence')

    logger.info('Writing evidence')
    with tempfile.NamedTemporaryFile('w+t') as out_handle:
        for key in evidence:
            line = '{},{}\n'.format(key, evidence[key])
            out_handle.write(line)

        out_handle.flush()
        out_handle.seek(0)

        evidence_table = pd.read_csv(out_handle, header=None)

    del evidence

    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns = ['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(
            ['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    elif gene_tags:
        expanded = expanded.sort_index()
        genes = expanded

    else:
        # make data frame have a complete accounting of transcripts
        targets = pd.Series(index=set(targets))
        targets = targets.sort_index()
        expanded = expanded.reindex(targets.index.values, fill_value=0)
        genes = expanded

    genes.fillna(0, inplace=True)
    genes = genes.astype(int)
    genes.index.name = "gene"

    logger.info('Output results')

    if subsample:
        cb_hist_sampled.to_csv('ss_{}_'.format(subsample) +
                               os.path.basename(cb_histogram),
                               sep='\t')

    if output_evidence_table:
        # The temporary evidence file has already been closed and removed at
        # this point, so write out the table that was read back from it.
        evidence_table.to_csv(output_evidence_table, header=False, index=False)

    if sparse:
        pd.Series(genes.index).to_csv(out + ".rownames", index=False)
        pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False)
        with open(out, "w+b") as out_handle:
            scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes))
    else:
        genes.to_csv(out)
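When sparse output is requested, the counts end up as a Matrix Market file plus separate row-name and column-name files. A small sketch of reading that output back into a labelled DataFrame (assumes a recent pandas and the file names written above):

# Sketch: reload the sparse matrix written above, re-attaching gene and cell
# labels from the .rownames and .colnames files.
import pandas as pd
import scipy.io

def read_sparse_counts(out):
    matrix = scipy.io.mmread(out).tocsr()
    genes = pd.read_csv(out + ".rownames", header=None)[0]
    cells = pd.read_csv(out + ".colnames", header=None)[0]
    return pd.DataFrame.sparse.from_spmatrix(matrix, index=genes, columns=cells)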
Ejemplo n.º 52
0
def species_pileup(species_id, args, tempdir, outputdir, contig_file,
                   contigs_db_stats):
    # Read in contigs information for current species_id

    contigs = {}
    contigs_db_stats[
        'species_counts'] += 1  # not being updated and passed as expected

    with InputStream(contig_file) as file:
        for rec in Bio.SeqIO.parse(file, 'fasta'):
            contigs[rec.id] = {
                "species_id": species_id,
                "contig_len": int(len(rec.seq)),
                "contig_seq": str(rec.seq),
            }
            contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"]
            contigs_db_stats['total_seqs'] += 1

    # Summary statistics
    aln_stats = {
        "genome_length": 0,
        "total_depth": 0,
        "covered_bases": 0,
        "aligned_reads": 0,
        "mapped_reads": 0,
    }

    def keep_read(x):
        return keep_read_worker(x, args, aln_stats)

    header = [
        'ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c',
        'count_g', 'count_t'
    ]
    path = f"{outputdir}/{species_id}.snps.lz4"

    with OutputStream(path) as file:

        file.write('\t'.join(header) + '\n')
        zero_rows_allowed = not args.sparse

        # Loop over alignment for current species's contigs
        with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile:
            for contig_id in sorted(list(contigs.keys())):  # why need to sort?
                contig = contigs[contig_id]
                counts = bamfile.count_coverage(
                    contig_id,
                    start=0,
                    end=contig["contig_len"],
                    quality_threshold=args.aln_baseq,
                    read_callback=keep_read)

                for ref_pos in range(0, contig["contig_len"]):
                    ref_allele = contig["contig_seq"][ref_pos]
                    depth = sum([counts[nt][ref_pos] for nt in range(4)])
                    count_a = counts[0][ref_pos]
                    count_c = counts[1][ref_pos]
                    count_g = counts[2][ref_pos]
                    count_t = counts[3][ref_pos]
                    values = [
                        contig_id, ref_pos + 1, ref_allele, depth, count_a,
                        count_c, count_g, count_t
                    ]

                    if depth > 0 or zero_rows_allowed:
                        file.write('\t'.join(str(val)
                                             for val in values) + '\n')

                    aln_stats['genome_length'] += 1
                    aln_stats['total_depth'] += depth
                    if depth > 0:
                        aln_stats['covered_bases'] += 1

    tsprint(json.dumps({species_id: aln_stats}, indent=4))
    return (species_id, {k: str(v) for k, v in aln_stats.items()})
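keep_read_worker is defined elsewhere in the project; count_coverage only requires that the callback take a pysam AlignedSegment and return whether to count it. A purely illustrative sketch (the filtering criteria and the args.aln_mapq attribute are assumptions, not the project's actual rules):

# Hypothetical sketch of keep_read_worker: count every fetched alignment,
# keep only primary, mapped reads above a mapping-quality cutoff, and record
# how many passed in aln_stats.
def keep_read_worker(aln, args, aln_stats):
    aln_stats['aligned_reads'] += 1
    if aln.is_unmapped or aln.is_secondary or aln.is_supplementary:
        return False
    if aln.mapping_quality < args.aln_mapq:  # args.aln_mapq is assumed here
        return False
    aln_stats['mapped_reads'] += 1
    return True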
Ejemplo n.º 53
0
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram,
                 cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix):
    ''' Count up evidence for tagged molecules, this implementation assumes the
    alignment file is coordinate sorted
    '''
    from pysam import AlignmentFile

    from io import StringIO
    import pandas as pd

    from utils import weigh_evidence

    if sam.endswith(".sam"):
        logger.error(
            "To use the fasttagcount subcommand, the alignment file must be a "
            "coordinate sorted, indexed BAM file.")
        sys.exit(1)

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            try:
                gene_map = dict(p.strip().split() for p in fh)
            except ValueError:
                logger.error('Incorrectly formatted gene_map; it needs to be TSV.')
                sys.exit()

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    if not cb_cutoff:
        cb_cutoff = 0

    if cb_histogram and cb_cutoff == "auto":
        cb_cutoff = guess_depth_cutoff(cb_histogram)

    cb_cutoff = int(cb_cutoff)

    cb_hist = None
    filter_cb = False
    if cb_histogram:
        cb_hist = pd.read_table(cb_histogram,
                                index_col=0,
                                header=-1,
                                squeeze=True)
        total_num_cbs = cb_hist.shape[0]
        cb_hist = cb_hist[cb_hist > cb_cutoff]
        logger.info('Keeping {} out of {} cellular barcodes.'.format(
            cb_hist.shape[0], total_num_cbs))
        filter_cb = True

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    if subsample:
        logger.info(
            'Creating reservoir of subsampled reads ({} per cell)'.format(
                subsample))
        start_sampling = time.time()

        reservoir = collections.defaultdict(list)
        cb_hist_sampled = 0 * cb_hist
        cb_obs = 0 * cb_hist

        track = stream_bamfile(sam)
        current_read = 'none_observed_yet'
        for i, aln in enumerate(track):
            if aln.qname == current_read:
                continue

            current_read = aln.qname

            if parse_tags:
                CB = aln.get_tag('CR')
            else:
                match = parser_re.match(aln.qname)
                CB = match.group('CB')

            if CB not in cb_hist.index:
                continue

            cb_obs[CB] += 1
            if len(reservoir[CB]) < subsample:
                reservoir[CB].append(i)
                cb_hist_sampled[CB] += 1
            else:
                s = pd.np.random.randint(0, cb_obs[CB])
                if s < subsample:
                    reservoir[CB][s] = i

        index_filter = set(itertools.chain.from_iterable(reservoir.values()))
        sam_file.close()
        sampling_time = time.time() - start_sampling
        logger.info('Sampling done - {:.3}s'.format(sampling_time))

    evidence = collections.defaultdict(lambda: collections.defaultdict(float))
    bare_evidence = collections.defaultdict(float)
    logger.info('Tallying evidence')
    start_tally = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    transcript_map = collections.defaultdict(set)
    sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]]
    if gene_map:
        for transcript, gene in gene_map.items():
            if transcript in sam_transcripts:
                transcript_map[gene].add(transcript)
    else:
        for transcript in sam_transcripts:
            transcript_map[transcript].add(transcript)
    missing_transcripts = set()
    alignments_processed = 0
    unmapped = 0
    kept = 0
    nomatchcb = 0
    current_read = 'none_observed_yet'
    current_transcript = None
    count_this_read = True
    transcripts_processed = 0
    genes_processed = 0
    cells = list(cb_hist.index)
    targets_seen = set()

    if umi_matrix:
        bare_evidence_handle = open(umi_matrix, "w")
        bare_evidence_handle.write(",".join(["gene"] + cells) + "\n")

    with open(out, "w") as out_handle:
        out_handle.write(",".join(["gene"] + cells) + "\n")
        for gene, transcripts in transcript_map.items():
            for transcript in transcripts:
                for aln in sam_file.fetch(transcript):
                    alignments_processed += 1

                    if aln.is_unmapped:
                        unmapped += 1
                        continue

                    if gene_tags and not aln.has_tag('GX'):
                        unmapped += 1
                        continue

                    if aln.qname != current_read:
                        current_read = aln.qname
                        if subsample and i not in index_filter:
                            count_this_read = False
                            continue
                        else:
                            count_this_read = True
                    else:
                        if not count_this_read:
                            continue

                    if parse_tags:
                        CB = aln.get_tag('CR')
                    else:
                        match = parser_re.match(aln.qname)
                        CB = match.group('CB')

                    if filter_cb:
                        if CB not in cb_hist.index:
                            nomatchcb += 1
                            continue

                    if parse_tags:
                        MB = aln.get_tag('UM')
                    else:
                        MB = match.group('MB')

                    if gene_tags:
                        target_name = aln.get_tag('GX').split(',')[0]
                    else:
                        txid = sam_file.getrname(aln.reference_id)
                        if gene_map:
                            if txid in gene_map:
                                target_name = gene_map[txid]
                            else:
                                missing_transcripts.add(txid)
                                continue
                        else:
                            target_name = txid
                    targets_seen.add(target_name)

                    # Scale evidence by number of hits
                    evidence[CB][MB] += weigh_evidence(aln.tags)
                    bare_evidence[CB] += weigh_evidence(aln.tags)
                    kept += 1
                transcripts_processed += 1
                if not transcripts_processed % 1000:
                    logger.info("%d genes processed." % genes_processed)
                    logger.info("%d transcripts processed." %
                                transcripts_processed)
                    logger.info("%d alignments processed." %
                                alignments_processed)

            earray = []
            for cell in cells:
                umis = [
                    1 for _, v in evidence[cell].items() if v >= minevidence
                ]
                earray.append(str(sum(umis)))
            out_handle.write(",".join([gene] + earray) + "\n")
            earray = []
            if umi_matrix:
                for cell in cells:
                    earray.append(str(int(bare_evidence[cell])))
                bare_evidence_handle.write(",".join([gene] + earray) + "\n")

            evidence = collections.defaultdict(
                lambda: collections.defaultdict(int))
            bare_evidence = collections.defaultdict(int)
            genes_processed += 1

    if umi_matrix:
        bare_evidence_handle.close()

    # fill dataframe with missing values, sort and output
    df = pd.read_csv(out, index_col=0, header=0)
    targets = pd.Series(index=set(transcript_map.keys()))
    targets = targets.sort_index()
    df = df.reindex(targets.index.values, fill_value=0)
    df = df.sort_index()
    df.to_csv(out)

    if umi_matrix:
        df = pd.read_csv(umi_matrix, index_col=0, header=0)
        df = df.reindex(targets.index.values, fill_value=0)
        df = df.sort_index()
        df.to_csv(umi_matrix)
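Because fasttagcount fetches reads one transcript at a time, the input BAM has to be coordinate sorted and indexed. If it is not already, a small preparation sketch using pysam's samtools wrappers (file names are placeholders):

# Sketch: coordinate-sort and index a BAM so that fetch(transcript) works.
import pysam

pysam.sort("-o", "alignments.sorted.bam", "alignments.bam")
pysam.index("alignments.sorted.bam")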
Ejemplo n.º 54
0
from pysam import AlignmentFile
from pyfaidx import Fasta

def has_mismatch_in_interval(reference, bamfile, chrom, start, end):
    """
    Return whether there is a mismatch in the interval (start, end) in any read mapping to the given chromosome.

    reference -- a pyfaidx.Fasta object or something that behaves similarly
    """
    for column in bamfile.pileup(chrom, start, end):
        refbase = str(reference[chrom][column.pos:column.pos + 1]).upper()
        for piledup in column.pileups:
            if piledup.is_del or piledup.is_refskip:
                # The read has no base at this column (deletion or reference
                # skip), so query_position is None and nothing can be compared.
                continue
            if piledup.indel != 0:  # Insertion is positive; deletion is negative
                # Ignore reads with an indel starting at this column
                continue
            querybase = piledup.alignment.query_sequence[piledup.query_position].upper()
            if refbase != querybase:
                # Mismatch
                return True
    return False


ref = Fasta('reference.fasta')
bamfile = AlignmentFile('mappedreads.bam')
has_mismatch_in_interval(ref, bamfile, 'scaffold17', 1000, 2000)
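Note that pileup also yields columns outside the requested interval whenever an overlapping read extends past it. If only positions within (start, end) should be examined, recent pysam versions accept truncate=True; a variant sketch that counts mismatches in the interval using the objects created above:

# Variant sketch: truncate=True restricts the pileup to the requested columns.
mismatches = 0
for column in bamfile.pileup('scaffold17', 1000, 2000, truncate=True):
    refbase = str(ref['scaffold17'][column.pos:column.pos + 1]).upper()
    for piledup in column.pileups:
        if piledup.is_del or piledup.is_refskip or piledup.indel != 0:
            continue
        if piledup.alignment.query_sequence[piledup.query_position].upper() != refbase:
            mismatches += 1
print(mismatches)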
Ejemplo n.º 55
0
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence):
    ''' Count up evidence for tagged molecules
    '''
    from pysam import AlignmentFile
    from cStringIO import StringIO
    import pandas as pd

    from utils import weigh_evidence

    logger.info('Reading optional files')

    gene_map = None
    if genemap:
        with open(genemap) as fh:
            gene_map = dict(p.strip().split() for p in fh)

    if positional:
        tuple_template = '{0},{1},{2},{3}'
    else:
        tuple_template = '{0},{1},{3}'

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    logger.info('Tallying evidence')
    start_tally = time.time()

    evidence = collections.defaultdict(int)

    sam_file = AlignmentFile(sam, mode='r')
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue

        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        MB = match.group('MB')

        txid = sam_file.getrname(aln.reference_id)
        if gene_map:
            target_name = gene_map[txid]

        else:
            target_name = txid

        e_tuple = tuple_template.format(CB, target_name, aln.pos, MB)

        # Scale evidence by number of hits
        evidence[e_tuple] += weigh_evidence(aln.tags)

    tally_time = time.time() - start_tally
    logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * i / tally_time)))
    logger.info('Collapsing evidence')

    buf = StringIO()
    for key in evidence:
        line = '{},{}\n'.format(key, evidence[key])
        buf.write(line)

    buf.seek(0)
    evidence_table = pd.read_csv(buf, header=None)
    evidence_query = 'evidence >= %f' % minevidence
    if positional:
        evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size()

    else:
        evidence_table.columns=['cell', 'gene', 'umi', 'evidence']
        collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size()

    expanded = collapsed.unstack().T

    if gene_map:
        # This Series is just for sorting the index
        genes = pd.Series(index=set(gene_map.values()))
        genes = genes.sort_index()
        # Now genes is assigned to a DataFrame
        genes = expanded.ix[genes.index]

    else:
        genes = expanded

    genes.replace(pd.np.nan, 0, inplace=True)

    logger.info('Output results')

    if output_evidence_table:
        import shutil
        buf.seek(0)
        with open(output_evidence_table, 'w') as etab_fh:
            shutil.copyfileobj(buf, etab_fh)

    genes.to_csv(out)
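The parser_re pattern above (and in the later tagcount variants) expects fastqtransformed read names with the cell barcode and UMI embedded. A quick illustration of what it extracts, using an invented read name:

# Illustration: parsing a fastqtransformed read name (this name is made up).
import re

parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
match = parser_re.match('HWI-ST808:130:H0B8YADXX:1:1101:2088:2222:CELL_GGTCCA:UMI_CCCT')
print(match.group('CB'))  # GGTCCA
print(match.group('MB'))  # CCCT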
Ejemplo n.º 56
0
def load_hic_data_from_bam(fnam, resolution, biases=None, tmpdir='.', ncpus=8,
                           filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                           region=None, verbose=True, clean=True):
    """
    :param fnam: TADbit-generated BAM file with read-ends1 and read-ends2
    :param resolution: the resolution of the experiment (size of a bin in
       bases)
    :param None biases: path to the pickle file where the biases are stored.
       Keys in this file should be: 'biases', 'badcol', 'decay' and
       'resolution'
    :param '.' tmpdir: path to the folder in which to create temporary files
    :param 8 ncpus: number of CPUs to use
    :param (1, 2, 3, 4, 6, 7, 8, 9, 10) filter_exclude: filters defining the
       set of valid read pairs
    :param None region: chromosome name; if None, the whole genome is loaded

    :returns: HiC_data object
    """
    bam = AlignmentFile(fnam)
    genome_seq = OrderedDict((c, l) for c, l in
                             zip(bam.references,
                                 [x / resolution + 1 for x in bam.lengths]))
    bam.close()

    sections = []
    for crm in genome_seq:
        len_crm = genome_seq[crm]
        sections.extend([(crm, i) for i in xrange(len_crm)])

    size = sum(genome_seq.values())

    chromosomes = {region: genome_seq[region]} if region else genome_seq
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    imx = HiC_data((), size, chromosomes=chromosomes, dict_sec=dict_sec,
                   resolution=resolution)

    if biases:
        if isinstance(biases, basestring):
            biases = load(open(biases))
        if biases['resolution'] != resolution:
            raise Exception('ERROR: resolution of the biases (%d) does not '
                            'match the one requested (%d)' % (
                                biases['resolution'], resolution))
        if region:
            chrom_start = 0
            for crm in genome_seq:
                if crm == region:
                    break
                len_crm = genome_seq[crm]
                chrom_start += len_crm
            imx.bads     = dict((b - chrom_start, biases['badcol'][b]) for b in biases['badcol'])
            imx.bias     = dict((b - chrom_start, biases['biases'][b]) for b in biases['biases'])
        else:
            imx.bads     = biases['badcol']
            imx.bias     = biases['biases']
        imx.expected = biases['decay']

    get_matrix(fnam, resolution, biases=None, filter_exclude=filter_exclude,
               normalization='raw', tmpdir=tmpdir, clean=clean,
               ncpus=ncpus, dico=imx, region1=region, verbose=verbose)
    imx._symmetricize()
    imx.symmetricized = True

    return imx
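A minimal usage sketch; the BAM path, resolution, biases file and region are placeholders:

# Sketch: load a 100 kb resolution contact map for one chromosome from a
# TADbit-generated BAM, applying previously computed biases.
hic_data = load_hic_data_from_bam('sample_rep1.bam', 100000,
                                  biases='biases_100kb.pickle',
                                  region='chr3', ncpus=4)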