Example #1
def parse_bam(bam_path):
    '''
    Returns alignment information for each reference sequence as an OrderedDict
    '''
    alignments = OrderedDict()
    with open(bam_path, 'r') as bam_fh:
        bam = simplesam.Reader(bam_fh)
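        # simplesam exposes the SAM header as a dict of dicts:
        # bam.header['@SQ'] maps 'SN:<name>' -> ['LN:<length>', ...]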
        refs_lens = {
            n.replace('SN:', ''): int(l[0].replace('LN:', ''))
            for n, l in bam.header['@SQ'].items()
        }

        refs_records = defaultdict(list)
        for r in bam:
            refs_records[r.rname].append(r)

        if '*' in refs_records:
            del refs_records['*']

    # assert len(refs_records) <= 1, 'Detected primary mappings to more than one reference'
    # For now, use samtools view to extract primary mappings for a single contig;
    # supporting multiple references here would be a useful enhancement

    for ref_id, records in refs_records.items():
        alignments[ref_id] = parse_records(ref_id, refs_lens[ref_id], records)

    return alignments
Example #2
def processRegion(R):
    global fname, simMode
    r = R[0] + ":" + str(R[1]) + "-" + str(R[1] + 1)
    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=r)
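        # with regions=, simplesam drives `samtools view` under the hood,
        # so fname must be an indexed BAM (samfile.p is that subprocess)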

        S = [R[1], [], [], [], [], [], [], []]
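        #S slots: site position, bases, bqs, readpos, clip, ind, HP, ASXS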
        while True:
            try:  #get next read
                read = samfile.next()
                if read.duplicate or not read.passing or read.secondary or not read.mapped:
                    continue
                #indel at site in question?
                (clip, ind, HP, ASXS) = readFeatures(read)
                gappedSeq = read.gapped('seq')
                p = R[1] - read.pos  #offset of the site within the gapped read
                try:
                    S[1].append(gappedSeq[p])
                except IndexError:
                    sys.stderr.write(str(R[1]) + "\t" + str(read.pos) + "\n")
                    return None
                S[2].append(ord(read.gapped('qual')[p]) - 33)
                S[3].append(min(len(gappedSeq) - p,
                                p))  #distance from either read end; not exact due to indels
                S[4].append(clip)
                S[5].append(ind)
                S[6].append(HP)
                S[7].append(ASXS)

            except StopIteration:  #no more reads
                samfile.close()
                samfile.p.wait()  #Prevent Z-status samtools processes
                return siteFeatures(S, R[2], simMode)
Example #3
def parse_bam(bam_path):
    '''
    Returns alignment information for each reference sequence as an OrderedDict
    '''
    alignments = OrderedDict()
    with open(bam_path, 'r') as bam_fh:
        bam = simplesam.Reader(bam_fh)
        refs_lens = {
            n.replace('SN:', ''): int(l[0].replace('LN:', ''))
            for n, l in bam.header['@SQ'].items()
        }
        refs_records = {ref_id: [] for ref_id in refs_lens}
        for r in bam:  #single pass; the Reader can only be consumed once
            if r.rname in refs_records:  #skip unmapped reads ('*')
                refs_records[r.rname].append(r)
        for ref_id, records in refs_records.items():
            alignments[ref_id] = parse_records(ref_id, refs_lens[ref_id],
                                               records)
    return alignments
Example #4
def main():
    parser = ArgumentParser(description = "Utility to split a tagged BAM/SAM " + \
            "file into separate SAM files for different barcodes.")
    parser.add_argument("input", help = "BAM/SAM tagged file")
    parser.add_argument("outdir", help = "Directory where to store the splitted SAM files")
    parser.add_argument("-t", "--tag-name", default="XC", help = "TAG name (default=XC)")
    parser.add_argument("-b", "--barcodes-file", help = "Selection of cell barcodes (default: all barcodes in BAM/SAM file)")
    args = parser.parse_args()

    mkdir(args.outdir)
    # user provided a subset of barcodes (create dictionary and set a flag)
    if args.barcodes_file:
        barcodes = dict.fromkeys(open(args.barcodes_file).read().split(), 0)
        subset_barcodes = True
    else:
        barcodes = dict()
        subset_barcodes = False

    in_samfile = open(args.input, "r")
    in_sam = simplesam.Reader(in_samfile)

    print("Selected TAG name: {name}".format(name=args.tag_name), file=stderr)
    info("Analyzing file {file}".format(file=args.input))

    selected_reads = 0
    for tot_reads, read in enumerate(in_sam):
        if (tot_reads + 1) % 100000 == 0:
            info("Reads (selected/total): {s}/{t}".format(s=str(selected_reads), t=str(tot_reads+1)))

        cell_bc = read.tags.get(args.tag_name)
        if (cell_bc and subset_barcodes and (cell_bc in barcodes)) or (cell_bc and not subset_barcodes):
            selected_reads += 1
            out_filename = join_path(args.outdir, cell_bc + ".sam")
            if not barcodes.get(cell_bc):  # NB: get() yields None for unseen barcodes and 0 for not-yet-written ones; both are falsy
                barcodes[cell_bc] = barcodes.get(cell_bc, 0) + 1
                with open(out_filename, "w") as out_samfile:
                    out_sam = simplesam.Writer(out_samfile, in_sam.header)
                    out_sam.write(read)
            else:
                barcodes[cell_bc] += 1
                with open(out_filename, "a") as out_samfile:
                    print(str(read), end="", file=out_samfile)
    in_sam.close()
Example #5
    def _get_base_alignment(self, read):
        if self.bwapy_aligner:
            alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
            if len(alignments) == 0:
                return None
            alignment = alignments[0]
            cigar = alignment.cigar
            is_reverse_complement = alignment.orient == '-'
            mapped_position = alignment.pos

        else:
            read_fastq_filename = None
            with tempfile.NamedTemporaryFile(mode='w',
                                             delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.fastq') as file:
                read_fastq_filename = file.name
                file.write(read.fastq)

            bwa_output_filename = None
            with tempfile.NamedTemporaryFile(delete=True,
                                             prefix='nadavca_tmp',
                                             suffix='.sam') as file:
                bwa_output_filename = file.name

            subprocess.run([
                self.bwa_executable, 'mem', self.reference_filename,
                read_fastq_filename, '-o', bwa_output_filename
            ],
                           stderr=subprocess.PIPE,
                           check=True)
            with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
                sam = reader.next()
                if not sam.mapped:
                    return None
                cigar = sam.cigar
                is_reverse_complement = sam.reverse
                mapped_position = sam.pos - 1

            os.remove(read_fastq_filename)
            os.remove(bwa_output_filename)

        oriented_read = Genome.reverse_complement(
            read.sequence) if is_reverse_complement else read.sequence

        index_in_read = 0
        index_in_reference = mapped_position
        base_mapping = []
        parsed_cigar = self._parse_cigar(cigar)
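        # walk the CIGAR: 'S' and 'I' consume read bases, 'D' consumes
        # reference bases, 'M' consumes both (matching bases are recorded)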

        for num, operation in parsed_cigar:
            if operation == 'S':
                index_in_read += num
            elif operation == 'M':
                for i in range(num):
                    if self.reference[index_in_reference] == oriented_read[index_in_read]:
                        base_mapping.append((index_in_read, index_in_reference))
                    index_in_read += 1
                    index_in_reference += 1
            elif operation == 'D':
                index_in_reference += num
            elif operation == 'I':
                index_in_read += num
            else:
                raise ValueError(
                    'Unknown cigar operation: {}'.format(operation))

        if is_reverse_complement:
            for i, val in enumerate(base_mapping):
                base_mapping[i] = (len(read.sequence) - 1 - val[0],
                                   len(self.reference) - 1 - val[1])
            base_mapping.reverse()

        return numpy.array(base_mapping,
                           dtype=int), is_reverse_complement
Example #6
def processRegion(R):  #samtools format region string
    chrom = R.split(":")[0]
    global CONTIGS
    if chrom not in CONTIGS: return []
    start = int(R.split(":")[1].split("-")[0])
    end = int(R.split(":")[1].split("-")[1])
    predictionArray = [
    ]  #list of feature vectors, 0th element in each vector is chrom and 1st is bp position
    SITES = dict()
    last_site_eval = -1
    nreads = 0
    global fname
    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=R)

        while True:
            try:  #get next read
                read = samfile.next()

                if read.duplicate or not read.passing or read.secondary:
                    continue
                nreads += 1
                (clip, clipLen, ind, pair, strands) = readFeatures(read)

                gappedSeq = read.gapped('seq')  #one time compute
                gappedQual = read.gapped('qual')
                #Features for the whole read
                for (p, refpos) in enumerate(read.coords):  #"genomic coordinates for the gapped alignment"
                    if gappedSeq[p] == "-":
                        continue
                    refpos = p + read.pos

                    if refpos in SITES:
                        S = SITES[refpos]
                    else:
                        S = [refpos, [], [], [], [], [], [], [], []]
                        #slots: refpos, bases, bqs, readpos, clip, ind, pair, strands, clipLen
                        SITES[refpos] = S

                    S[1].append(gappedSeq[p])
                    S[2].append(ord(gappedQual[p]) - 33)
                    S[3].append(min((len(gappedSeq) - p),
                                    p))  #not exact due to indel
                    S[4].append(clip)
                    S[5].append(ind)
                    S[6].append(pair)
                    S[7].append(strands)
                    S[8].append(clipLen)

                if nreads >= READBATCH or len(SITES) > SITEBATCH:
                    #batch size for evaluating sites where all reads seen -> memory savings
                    for refpos in list(SITES):  #snapshot the keys so entries can be deleted during iteration
                        if refpos >= max(last_site_eval + 1, start) and refpos < min(read.pos, end):
                            S = SITES[refpos]
                        else:
                            continue
                        feature_vector = siteFeatures(S)
                        if feature_vector is not None:
                            predictionArray.append([chrom, refpos] +
                                                   feature_vector)
                        del SITES[S[0]]
                    last_site_eval = read.pos - 1
                    nreads = 0

            except StopIteration:  #no more reads

                for refpos in list(SITES):  #snapshot the keys so entries can be deleted during iteration
                    if refpos >= max(last_site_eval + 1, start) and refpos < min(read.pos, end):
                        S = SITES[refpos]
                    else:
                        continue
                    feature_vector = siteFeatures(S)
                    if feature_vector is not None:
                        predictionArray.append([chrom, refpos - 1] +
                                               feature_vector)
                    del SITES[S[0]]
                break  #all sites classified
        samfile.close()
        samfile.p.wait()  #Prevent Z-status samtools processes
        sys.stderr.write(R + "\n")
        return predictionArray
        sys.stderr.write(R + "\n")
        return predictionArray


Example #7
parser = argparse.ArgumentParser(
    description='python filter.py --bam sample.bam --nproc 8')
parser.add_argument('--bam', help='Input bam file name', required=True)
parser.add_argument('--nproc', help="parallelism", required=False, default=1)

args = parser.parse_args()
NFEAT = 33
NPAR = int(args.nproc)
fname = args.bam

with open(fname, 'rb') as F:
    S = simplesam.Reader(F)
    regions = S.tile_genome(WINDOW_SIZE)
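    # tile_genome() yields samtools-style region strings ("chrom:start-end")
    # covering each reference sequence in WINDOW_SIZE chunks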
    S.close()
    S.p.wait()

if NPAR > 1:
    import multiprocessing as mp
    pool = mp.Pool(processes=NPAR, maxtasksperchild=1)
    #a new process is started for each region, with at most NPAR active at a time
    for result in pool.imap_unordered(processRegion, regions):
        p = [str(x) for x in result]  #predictionArray
        if len(p) > 0: print("\n".join(p))
else:
    for r in regions:
        p = [str(x) for x in processRegion(r)]  #predictionArray
        if len(p) > 0: print("\n".join(p))
Example #8
import pyfastx
import simplesam
import os

os.chdir(
    '/research/projects/yu3grp/IO_JY/yu3grp/LVXSCID/patients_scATACseq/multiome_P1'
)
bam_file = './03_chimeric/P1_scMulti_ATAC_S1_pe.mated.filter.bam'
out_sam_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_wCB.sam'
cellID_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_R2.fastq'

#fa = pyfastx.Fastx('./LVX_SCID_P1_S1_L001_pe.mated.filter2.bam_readbarcode')
fa = pyfastx.Fastx(cellID_file)

barcodes = {}
for name, seq, qual, comment in fa:
    barcodes[name] = seq

barcode_tag = 'CB'

with simplesam.Reader(open(bam_file)) as in_bam:
    with simplesam.Writer(open(out_sam_file, 'w'), in_bam.header) as out_sam:
        for read in in_bam:
            #read[umi_tag] = barcodes[read.qname][0]
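            # item assignment on a simplesam record sets/overwrites the CB tag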
            read[barcode_tag] = barcodes[read.qname]
            out_sam.write(read)
Example #9
def processRegion(R):  #bed interval -> samtools format region string
    global fname
    chrom = R[0]
    start = int(R[1])
    r = chrom + ":" + str(start + 1) + "-" + str(start + 2)  #1-indexed
    alignments = []
    alignmentStarts = []
    alignmentEnds = []
    readNames = []
    basesAtSite = []
    pileupStart = None
    pileupEnd = None

    with open(fname, 'rb') as filenameopen:
        samfile = simplesam.Reader(filenameopen, regions=r)
        global fastafname
        reference = Fasta(fastafname)
        #Fasta() is apparently not "thread safe" to use as a global; creating one per process does not hurt performance
        numPhased = 0
        while True:
            try:  #get next read
                read = samfile.next()
                if read.duplicate or not read.passing or read.secondary:
                    continue
                readNames.append(read.qname)
                try:
                    HP = read["PH"]
                    numPhased += 1
                except KeyError:  #read has no PH (phasing) tag
                    HP = None
                C = read.cigars
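                # read.cigars gives (length, op) tuples; a leading/trailing 'S'
                # entry is the soft-clip length at that end of the read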
                if C[0][1] == "S":
                    qstart = C[0][0]
                else:
                    qstart = 0
                rstart = read.pos
                if pileupStart is None or rstart < pileupStart:
                    pileupStart = rstart
                if C[-1][1] == "S":
                    qend = C[-1][0]
                else:
                    qend = 0
                rend = rstart + len(read)
                if pileupEnd is None or rend > pileupEnd: pileupEnd = rend

                qSeqAln = read.gapped('seq')

                #mismatches and indels
                alignmentStarts.append(rstart)
                alignmentEnds.append(rend)

                refSeq = reference[R[0]][rstart - 1:rend - 1].seq.upper()
                thisReadAln = []
                P = read.coords  #positions in reference for qSeqAln
                addedBase = False
                try:
                    for i in range(len(P)):  #position in reference
                        refPos = P[i]
                        qChar = qSeqAln[i]
                        if refPos == start:  #don't include the site of interest - see if they cluster otherwise.
                            if (i > 0 and refPos - 1 != P[i - 1]) or qChar != refSeq[refPos - rstart]:
                                basesAtSite.append(1)
                            else:
                                basesAtSite.append(0)
                            addedBase = True
                        elif i > 0 and refPos - 1 != P[i - 1]:
                            if len(thisReadAln) > 0: thisReadAln[-1] = 1
                            #insertion in query - change previous position in ref.
                        elif qChar == "-":
                            thisReadAln.append(1)
                            #deletion in query
                        elif qChar != refSeq[refPos - rstart]:
                            thisReadAln.append(1)
                            #mismatch
                        else:
                            thisReadAln.append(0)
                except IndexError:
                    sys.stderr.write(str(len(P)) + "\n")
                    sys.stderr.write(str(len(qSeqAln)) + "\n")
                    sys.stderr.write(str(len(refSeq)) + "\n")
                    sys.stderr.write(str(max(P)) + "\n")
                    sys.stderr.write(str(rstart) + "\n")
                    sys.stderr.write(str(P) + "\n")
                    sys.stderr.write(str(qSeqAln) + "\n")
                    sys.stderr.write(str(refSeq) + "\n")
                    return None

                alignments.append(thisReadAln)
                if not addedBase:
                    basesAtSite.append(1)  #read didn't align to that position
            except StopIteration:  #no more reads
                depth = len(readNames)
                if depth <= 0:
                    predictionString = None
                    sys.stderr.write(str(R) + "\n")
                    break
                fracPhased = 1.0 * numPhased / depth
                totalLen = pileupEnd - pileupStart + 2
                depthAlt = [0] * totalLen
                depthRef = [0] * totalLen
                sumRef = [0] * totalLen
                sumAlt = [0] * totalLen
                readsAtSite = [None for _ in range(depth)]

                for (i, seq) in enumerate(alignments):
                    L = len(seq)
                    readsAtSite[i] = (
                        [0] * (alignmentStarts[i] - pileupStart) + [1] + seq +
                        [1] + [0] * (pileupEnd - alignmentStarts[i] - L)
                    )  #the 1's surrounding seq mark the read ends - hopefully columns are still in line
                    if basesAtSite[i] == 0:
                        for j in range(-1, L + 1):
                            depthRef[j + alignmentStarts[i] - pileupStart +
                                     1] += 1
                        for j in range(totalLen):
                            sumRef[j] += readsAtSite[i][j]  #events on h0
                    else:
                        for j in range(-1, L + 1):
                            depthAlt[j + alignmentStarts[i] - pileupStart +
                                     1] += 1
                        for j in range(totalLen):
                            sumAlt[j] += readsAtSite[i][j]  #events on h1

                min_pval = 1
                min_table = None
                min_start = 0
                global V
                for k in range(totalLen):
                    if chrom + ":" + str(pileupStart + k) in V:
                        continue  #Called variant from VCF
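                    # Fisher's exact test on the 2x2 table of (event, non-event)
                    # counts for alt-supporting vs ref-supporting reads at column k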
                    p = pvalue(sumAlt[k], depthAlt[k] - sumAlt[k], sumRef[k],
                               depthRef[k] - sumRef[k])
                    if p.two_tail < min_pval:
                        min_pval = p.two_tail
                if min_pval < MIN_FISHER_PVAL: predictionString = None
                else:
                    predictionString = ("\t".join(
                        R + [str(min_pval), str(fracPhased)]))
                break

    samfile.close()
    samfile.p.wait()  #Prevent Z-status samtools process
    return predictionString