コード例 #1
0
def genrefblks(readseq, chrom, start, stop, strand, cigar, nreads):
    refpos = start
    readpos = 0
    if strand == '-':
        readseq = reverse_complement(readseq)

    tleftlim, trightlim = start + ENDTRIM, stop - ENDTRIM
    qleftlim, qrightlim = ENDTRIM, len(readseq) - ENDTRIM
    #print '==============='
    #print readseq, chrom, start, stop, strand, cigar, nreads

    if transcriptome:
        print 'P', nreads, chrom, strand, tleftlim, trightlim

    cigarcommands = cigar_pattern.findall(cigar)
    if cigarcommands[0][
            1] == 'S':  # shift start site for the first soft clipping
        start -= int(cigarcommands[0][0])
    if len(cigarcommands
           ) > 1 and cigarcommands[-1][1] == 'S':  # last soft clipping
        stop += int(cigarcommands[-1][0])

    for num, cmd in cigarcommands:
        num = int(num)
        if cmd == 'M':  # match
            mleft = max(qleftlim, readpos)
            mright = min(qrightlim, readpos + num)
            if mleft < mright:
                seq = readseq[mleft:mright]
                print 'M', nreads, chrom, strand, max(refpos, tleftlim), seq
            refpos += num
            readpos += num
        elif cmd == 'S':  # soft clip
            readpos += num
        elif cmd == 'N':  # skip
            refpos += num
        elif cmd == 'D':  # deletion
            if tleftlim <= refpos < trightlim:
                print 'D', nreads, chrom, strand, refpos, num
            refpos += num
        elif cmd == 'I':  # insertion
            ppos = (refpos if strand == '+' else (refpos - 1))
            if tleftlim <= ppos < trightlim:
                print 'I', nreads, chrom, strand, ppos, num
            readpos += num
        elif cmd == 'H':  # hard clipping
            pass
        else:
            print 'E', nreads, num, cmd, readseq
            raise ValueError

    if strand == '+':
        fivep, threep = start, stop - 1
    else:
        fivep, threep = stop - 1, start

    print '5', nreads, chrom, strand, fivep
    print '3', nreads, chrom, strand, threep
コード例 #2
0
def genrefblks(readseq, chrom, start, stop, strand, cigar, nreads):
    refpos = start
    readpos = 0
    if strand == '-':
        readseq = reverse_complement(readseq)

    tleftlim, trightlim = start + ENDTRIM, stop - ENDTRIM
    qleftlim, qrightlim = ENDTRIM, len(readseq) - ENDTRIM
    #print '==============='
    #print readseq, chrom, start, stop, strand, cigar, nreads

    if transcriptome:
        print 'P', nreads, chrom, strand, tleftlim, trightlim

    cigarcommands = cigar_pattern.findall(cigar)
    if cigarcommands[0][1] == 'S': # shift start site for the first soft clipping
        start -= int(cigarcommands[0][0])
    if len(cigarcommands) > 1 and cigarcommands[-1][1] == 'S': # last soft clipping
        stop += int(cigarcommands[-1][0])

    for num, cmd in cigarcommands:
        num = int(num)
        if cmd == 'M': # match
            mleft = max(qleftlim, readpos)
            mright = min(qrightlim, readpos + num)
            if mleft < mright:
                seq = readseq[mleft:mright]
                print 'M', nreads, chrom, strand, max(refpos, tleftlim), seq
            refpos += num
            readpos += num
        elif cmd == 'S': # soft clip
            readpos += num
        elif cmd == 'N': # skip
            refpos += num
        elif cmd == 'D': # deletion
            if tleftlim <= refpos < trightlim:
                print 'D', nreads, chrom, strand, refpos, num
            refpos += num
        elif cmd == 'I': # insertion
            ppos = (refpos if strand == '+' else (refpos-1))
            if tleftlim <= ppos < trightlim:
                print 'I', nreads, chrom, strand, ppos, num
            readpos += num
        elif cmd == 'H': # hard clipping
            pass
        else:
            print 'E', nreads, num, cmd, readseq
            raise ValueError

    if strand == '+':
        fivep, threep = start, stop-1
    else:
        fivep, threep = stop-1, start

    print '5', nreads, chrom, strand, fivep
    print '3', nreads, chrom, strand, threep
コード例 #3
0
def process(nrlist):
    """Write a BED line and a FASTA record for every accession in nrlist.

    For each accession the exon sequence is assembled from ``mm9`` using
    the ``exonBlocks`` of its ``refFlat`` entry and upper-cased.  Per-base
    A/C/G/T counts are fetched from ``rseqarr``; wherever a base accounts
    for at least ``MINPERCENTTOCALL`` of the depth and the depth is at
    least ``MINREADSTOCALL``, that base replaces the genomic one.  The
    sequence is reverse-complemented for '-' strand entries.

    Output:
        bedout: tab-separated ``accession, start, end, geneName, '.', '+'``.
            For accessions starting with 'NM_' start/end are the 5' UTR
            length and the 5' UTR + CDS length; otherwise 0 and the total
            exon length.
        faout: a ``>accession geneName`` header followed by the sequence
            as formatted by ``textwrap()``.

    Args:
        nrlist: iterable of accession strings that are keys of ``refFlat``.
    """
    nucleotidetracks = [rseqarr.TRACKS.index(i) for i in 'ACGT']

    for nracc in nrlist:
        dbinfo = refFlat[nracc]
        chrom = dbinfo['chrom']
        strand = dbinfo['strand']
        exonblocks = dbinfo['exonBlocks']
        genename = dbinfo['geneName']

        genomeseq = ''.join(
            mm9.get(chrom, blkstart, blkend)
            for blkstart, blkend in exonblocks).upper()

        if nracc.startswith('NM_'):
            # On the minus strand the 5' UTR is the right-hand UTR on the
            # genome.
            utr5 = 'rightUtrBlocks' if strand == '-' else 'leftUtrBlocks'
            featstart = blklength(dbinfo[utr5])
            featend = featstart + blklength(dbinfo['cdsBlocks'])
        else:
            # No CDS annotation is used: the feature spans every exon.
            featstart = 0
            featend = blklength(exonblocks)

        cntarray = rseqarr.get_blocks(chrom, exonblocks,
                                      strand)[nucleotidetracks]
        # Clip depth to >= 1 so uncovered positions do not divide by zero;
        # the float dtype makes the division below a true division.
        depthcnt = np.array(cntarray.sum(0).clip(1), 'd')
        confidentcalls = ((cntarray / depthcnt >= MINPERCENTTOCALL) *
                          (depthcnt >= MINREADSTOCALL))

        mutatedseq = list(genomeseq)
        for base, calls in zip('ACGT', confidentcalls):
            for pos in np.where(calls)[0]:
                mutatedseq[pos] = base
        mutatedseq = ''.join(mutatedseq)

        if strand == '-':
            mutatedseq = reverse_complement(mutatedseq)

        print >> bedout, '\t'.join([
            nracc,
            str(featstart),
            str(featend),
            '%s' % genename, '.', '+'
        ])

        print >> faout, '>%s %s' % (nracc, genename)
        faout.write(textwrap(mutatedseq))
コード例 #4
0
def process(nrlist):
    """Write a BED line and a FASTA record for every accession in nrlist.

    For each accession the exon sequence is assembled from ``mm9`` using
    the ``exonBlocks`` of its ``refFlat`` entry and upper-cased.  Per-base
    A/C/G/T counts are fetched from ``rseqarr``; wherever a base accounts
    for at least ``MINPERCENTTOCALL`` of the depth and the depth is at
    least ``MINREADSTOCALL``, that base replaces the genomic one.  The
    sequence is reverse-complemented for '-' strand entries.

    Output:
        bedout: tab-separated ``accession, start, end, geneName, '.', '+'``.
            For accessions starting with 'NM_' start/end are the 5' UTR
            length and the 5' UTR + CDS length; otherwise 0 and the total
            exon length.
        faout: a ``>accession geneName`` header followed by the sequence
            as formatted by ``textwrap()``.

    Args:
        nrlist: iterable of accession strings that are keys of ``refFlat``.
    """
    nucleotidetracks = [rseqarr.TRACKS.index(i) for i in 'ACGT']

    for nracc in nrlist:
        dbinfo = refFlat[nracc]
        chrom = dbinfo['chrom']
        strand = dbinfo['strand']
        exonblocks = dbinfo['exonBlocks']
        genename = dbinfo['geneName']

        genomeseq = ''.join(mm9.get(chrom, blkstart, blkend)
                            for blkstart, blkend in exonblocks).upper()

        if nracc.startswith('NM_'):
            # On the minus strand the 5' UTR is the right-hand UTR on the
            # genome.
            utr5 = 'rightUtrBlocks' if strand == '-' else 'leftUtrBlocks'
            featstart = blklength(dbinfo[utr5])
            featend = featstart + blklength(dbinfo['cdsBlocks'])
        else:
            # No CDS annotation is used: the feature spans every exon.
            featstart = 0
            featend = blklength(exonblocks)

        cntarray = rseqarr.get_blocks(chrom, exonblocks,
                                      strand)[nucleotidetracks]
        # Clip depth to >= 1 so uncovered positions do not divide by zero;
        # the float dtype makes the division below a true division.
        depthcnt = np.array(cntarray.sum(0).clip(1), 'd')
        confidentcalls = ((cntarray/depthcnt >= MINPERCENTTOCALL) *
                          (depthcnt >= MINREADSTOCALL))

        mutatedseq = list(genomeseq)
        for base, calls in zip('ACGT', confidentcalls):
            for pos in np.where(calls)[0]:
                mutatedseq[pos] = base
        mutatedseq = ''.join(mutatedseq)

        if strand == '-':
            mutatedseq = reverse_complement(mutatedseq)

        print >> bedout, '\t'.join([nracc, str(featstart), str(featend),
                                    '%s' % genename, '.', '+'])

        print >> faout, '>%s %s' % (nracc, genename)
        faout.write(textwrap(mutatedseq))
コード例 #5
0
ファイル: sam.py プロジェクト: jimkwon/nrclip
    def iteralignments(self, strands='+-', withref=False):
        """Yield one record per read name from the SAM stream self.samfile.

        Header lines are consumed without being yielded; ``@SQ`` lines
        additionally store the sequence length in ``self.seqlen``.  For
        every read, all of its alignments are gathered into one list: the
        alignment on the read's first line, the alternative hits listed in
        that line's ``XA`` tag, and the alignments on directly following
        lines that carry the same query name.  The first line with a
        different query name is handed back via ``self.samfile.push()``.

        Args:
            strands: strands to report; an alignment is kept only if its
                strand character ('+' or '-') occurs in this string.
            withref: if true, the reference subsequence returned by
                ``self.getsubseq`` (reverse-complemented for '-' hits) is
                appended to every alignment tuple.

        Yields:
            dict with the keys ``qname``, ``flags``, ``mapq``, ``seq`` and
            ``options`` taken from the read's first line (``seq`` is
            reverse-complemented when the reverse-strand flag is set), and
            ``mapped``: a list of
            ``(rname, start, stop, strand, editdist, cigar[, refseq])``
            tuples sorted by edit distance.  ``stop`` is always
            ``POS + reflen - 1``; ``start`` is ``POS``, or ``POS - 1`` when
            ``self.zerobase`` is true.  ``editdist`` is -1 for a line
            without an ``NM`` tag.
        """
        def refspan(pos, cigar):
            # Reference interval covered by an alignment at 1-based `pos`.
            reflen, _ = calculate_cigar_length(cigar)
            start = pos - 1 if self.zerobase else pos
            return start, pos + reflen - 1

        for line in self.samfile:
            # rstrip instead of line[:-1]: a final line without a newline
            # must not lose its last character, and CRLF input must not
            # leave a '\r' on the last field.
            fields = line.rstrip('\r\n').split('\t')

            if line.startswith('@'):
                # Header section: only @SQ lines carry anything we keep.
                if line.startswith('@SQ'):
                    sqname = fields[1][3:]
                    sqlen = [int(fl[3:]) for fl in fields[2:]
                             if fl[:3] == 'LN:'][0]
                    self.seqlen[sqname] = sqlen
                continue

            qname = fields[0]
            flags = int(fields[1])
            rname = fields[2]
            pos = int(fields[3])  # 1-based leftmost
            mapq = int(fields[4])  # phred-scaled
            cigar = fields[5]
            seq = fields[9]
            options = dict(v.split(':', 1) for v in fields[11:])

            if flags & F_REVERSE_STRAND:
                strand = '-'
                seq = reverse_complement(seq)
            else:
                strand = '+'

            editdist = int(options.get('NM', 'i:-1')[2:])

            mapped = []
            if rname != '*' and strand in strands:
                start, stop = refspan(pos, cigar)
                mapped.append((rname, start, stop, strand, editdist, cigar))

            # Alternative hits packed into the XA tag as
            # "rname,<strand><pos>,cigar,editdist;" entries.
            for altmatch in options.get('XA', 'Z:')[2:].split(';')[:-1]:
                xafields = altmatch.split(',')
                xastrand = xafields[1][0]
                if xastrand not in strands:
                    continue
                xacigar = xafields[2]
                xastart, xastop = refspan(int(xafields[1][1:]), xacigar)
                mapped.append((xafields[0], xastart, xastop, xastrand,
                               int(xafields[3]), xacigar))

            # Further alignments of the same read on the following lines.
            for altline in self.samfile:
                altfields = altline.rstrip('\r\n').split('\t')
                if altfields[0] != qname:
                    # Belongs to the next read: hand it back and stop.
                    self.samfile.push(altline)
                    break

                altrname = altfields[2]
                altcigar = altfields[5]
                altoptions = dict(v.split(':', 1) for v in altfields[11:])
                if int(altfields[1]) & F_REVERSE_STRAND:
                    altstrand = '-'
                else:
                    altstrand = '+'

                if altrname != '*' and altstrand in strands:
                    altstart, altstop = refspan(int(altfields[3]), altcigar)
                    alteditdist = int(altoptions.get('NM', 'i:-1')[2:])
                    mapped.append((altrname, altstart, altstop, altstrand,
                                   alteditdist, altcigar))

            mapped.sort(key=lambda m: m[4])  # stable sort by edit distance

            if withref:
                # The fetch was written for 1-based starts (start - 1).
                # When zerobase is set the start has already been shifted,
                # so shifting it again would begin one base too far left.
                # NOTE(review): assumes getsubseq takes a 0-based,
                # end-exclusive interval -- confirm against its definition.
                shift = 0 if self.zerobase else 1
                newmapped = []
                for m in mapped:
                    subseq = self.getsubseq(m[0], m[1] - shift, m[2])
                    if m[3] == '-':
                        subseq = reverse_complement(subseq)
                    newmapped.append(m + (subseq,))
                mapped = newmapped

            yield {
                'qname': qname,
                'flags': flags,
                'mapq': mapq,
                'seq': seq,
                'options': options,
                # start is 0-based when self.zerobase is set, else 1-based;
                # stop is the 1-based inclusive end in both cases.
                'mapped': mapped,
            }