Ejemplo n.º 1
0
def main_toBed(args):
    """Print every ungapped aligned block of the sliced chains as a tab row.

    ``args.i`` is handed to ``slice_chains`` together with the per-chain
    callback below.  ``args.g1`` / ``args.g2`` are optional reference paths;
    when both are given, the two aligned sequences of every block are also
    printed, colored by match/mismatch.
    """

    def proc_func_toBed(hchr, hstr, hpos, hend, mchr, mstr, mpos, mend, m, s,
                        t, data):
        """Emit one row per aligned block of a single chain.

        ``m`` holds the block lengths; ``s[i]`` and ``t[i]`` are the gaps
        that follow block ``i`` on the h and m side respectively
        (presumably the size/dt/dq columns of a chain file -- confirm
        against ``slice_chains``).  ``data`` is the ``(hgenome, mgenome)``
        pair passed to ``slice_chains``; either member may be None.
        """
        htoken = '%s:%d_%d:%s' % (hchr, hpos, hend, hstr)
        mtoken = '%s:%d_%d:%s' % (mchr, mpos, mend, mstr)
        hp1 = 0  # offset of the current block from the chain start, h side
        mp1 = 0  # same, m side

        hgenome, mgenome = data
        # Both must be bound even without a genome: they are tested in the
        # loop below (previously this raised NameError).
        hseq = None
        mseq = None
        if hgenome:
            hseq = hgenome.fetch_sequence(hchr, hpos, hend, uppercase=True)
            if hstr == '-':
                hseq = reverse_complement(hseq)
        if mgenome:
            mseq = mgenome.fetch_sequence(mchr, mpos, mend, uppercase=True)
            if mstr == '-':
                mseq = reverse_complement(mseq)

        last = len(m) - 1
        for i, m1 in enumerate(m):

            # On the minus strand the offsets are counted back from the end.
            if hstr == '+':
                hbeg, hstop = hpos + hp1, hpos + hp1 + m1
            else:
                hbeg, hstop = hend - hp1 - m1, hend - hp1
            if mstr == '+':
                mbeg, mstop = mpos + mp1, mpos + mp1 + m1
            else:
                # Fixed: the start used to be derived from ``hend``.
                mbeg, mstop = mend - mp1 - m1, mend - mp1

            wprint('\t'.join(
                map(str, [
                    hchr, hbeg, hstop, htoken, mchr, mbeg, mstop, mtoken
                ])))

            if hseq and mseq:
                hs = []
                ms = []
                for j in xrange(m1):
                    hb = hseq[hp1 + j]
                    mb = mseq[mp1 + j]
                    color = MCHC if hb == mb else MISC
                    hs.append(color + hb + ENDC)
                    ms.append(color + mb + ENDC)
                wprint(''.join(hs))
                wprint(''.join(ms))

            hp1 += m1
            mp1 += m1

            # No gap follows the final block.
            if i != last:
                hp1 += s[i]
                mp1 += t[i]

    import faidx
    h_genome = faidx.RefGenome(args.g1) if args.g1 else None
    m_genome = faidx.RefGenome(args.g2) if args.g2 else None
    slice_chains(args.i, proc_func_toBed, (h_genome, m_genome))
Ejemplo n.º 2
0
def main(args):
    """Collapse repeated intervals of ``args.bed`` and zero-fill the gaps.

    Each input line carries chrm, beg, end and a numeric value (tab
    separated).  Consecutive lines with the same interval are merged into
    one output row ``chrm, beg-1, end, total``; the total only accumulates
    when ``args.op == 'sum'``.  Rows with value 0 are printed for the
    stretch before the first interval of a chromosome, between
    non-adjacent intervals, and from the last interval of a chromosome to
    ``ref.chrm2len(chrm)`` whenever the chromosome changes.
    """
    ref = faidx.RefGenome(args.ref)

    def emit(*cols):
        # one tab-separated output row
        print('\t'.join(map(str, cols)))

    last_chrm = last_beg = last_end = None
    for line in args.bed:
        cols = line.strip().split('\t')
        chrm = cols[0]
        beg = int(cols[1])
        end = int(cols[2])
        val = float(cols[3])

        if (chrm, beg, end) != (last_chrm, last_beg, last_end):
            # A new interval starts: flush the one accumulated so far.
            if last_end is not None:
                emit(last_chrm, last_beg - 1, last_end, tval)
            tval = 0.0

            if chrm != last_chrm:
                # Close the previous chromosome, then open the new one.
                if last_end is not None:
                    emit(last_chrm, last_end, ref.chrm2len(last_chrm), 0)
                emit(chrm, 0, beg - 1, 0)
            elif last_end != beg - 1:
                # Same chromosome, intervals not adjacent.
                emit(chrm, last_end, beg - 1, 0)

        if args.op == 'sum':
            tval += val

        last_chrm, last_beg, last_end = chrm, beg, end

    # Flush the final interval (nothing to do for empty input).
    if last_chrm is not None:
        emit(last_chrm, last_beg - 1, last_end, tval)
Ejemplo n.º 3
0
 def __init__(self, window_queue, result_queue, winlen, vcf_path,
              reference_path):
     """Initialise the process and open its own reference and VCF handles.

     The two queues and the window length are stored as given; the
     reference is opened with ``faidx.RefGenome`` and the VCF with
     ``pysam.Tabixfile``.
     """
     multiprocessing.Process.__init__(self)
     self.winlen = winlen
     self.window_queue, self.result_queue = window_queue, result_queue
     # Handles are opened here, once per instance.
     self.reference = faidx.RefGenome(reference_path)
     self.vcf = pysam.Tabixfile(vcf_path)
Ejemplo n.º 4
0
def main_CpH(args):
    """Fan genome windows out to Consumer processes and collect the results.

    Windows of ``args.winlen`` bases are queued for ``args.chrm`` (or for
    chr1-22, X, Y and M when it is not set).  A fixed pool of Consumer
    processes reads them, and a single Writer drains the result queue.
    """
    # processes = multiprocessing.cpu_count() - 2
    n_workers = 25

    if args.chrm:
        chrms = [args.chrm]
    else:
        chrms = ['chr%s' % c for c in list(range(1, 23)) + ['X', 'Y', 'M']]

    windows = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    consumers = []
    for _ in xrange(n_workers):
        worker = Consumer(windows, results, args.winlen, args.vcf, args.ref)
        worker.start()
        consumers.append(worker)

    writer = Writer(results)
    writer.start()

    # Queue the windows.  They start from the second base, since we are
    # looking at binucleotides.
    genome = faidx.RefGenome(args.ref)
    winlen = args.winlen
    for chrm in chrms:
        chrmlen = genome.faidx[chrm][0]
        for winbeg in xrange(2, chrmlen - winlen, winlen):
            windows.put((chrm, winbeg))

    # One poison pill per worker.
    for _ in xrange(n_workers):
        windows.put(None)

    windows.join()

    # Consumers need to be joined so that the items they put on the
    # result queue get flushed.
    for worker in consumers:
        worker.join()

    # Tell the writer there is nothing more to come, then wait for it.
    results.put(None)
    writer.join()
Ejemplo n.º 5
0
def main_runningcomp(args):
    """ export local composition at every base

    For every sequence in the reference ``args.i`` a window of
    ``2 * args.k`` bases is slid along the upper-cased sequence, while the
    counts of C, G, CG dinucleotides and non-N bases are updated
    incrementally.  Every ``args.s``-th position is written to ``args.o``
    (stdout when it is None) as ``chrm, pos, pos+1, C, G, CG, n``.
    Sequences no longer than the window are skipped.

    Raises:
        Exception: on any base other than A, C, G, T or N.
    """
    def compute_comp(seq):
        # (C count, G count, non-N length, CG count) of a whole window
        return (seq.count('C'), seq.count('G'), len(seq) - seq.count('N'),
                seq.count('CG'))

    import faidx
    genome = faidx.RefGenome(args.i)
    out = open(args.o, 'w') if args.o is not None else sys.stdout
    width = args.k * 2
    try:
        for chrm in genome.faidx:
            if args.v:
                err_print(chrm)
            gseq = genome.fetch_chrmseq(chrm).upper()
            if len(gseq) <= width:
                continue
            c, g, n, cg = compute_comp(gseq[:width])
            for i in range(len(gseq) - width):
                b1 = gseq[i]          # base leaving the window
                b2 = gseq[i + width]  # base entering the window

                if b1 == 'N':
                    pass
                elif b1 == 'C':
                    n -= 1
                    c -= 1
                    # the leaving C takes a leading CG with it
                    if gseq[i + 1] == 'G':
                        cg -= 1
                elif b1 == 'G':
                    n -= 1
                    g -= 1
                elif b1 == 'A' or b1 == 'T':
                    n -= 1
                else:
                    raise Exception("Unknown base: %s" % b1)

                if b2 == 'N':
                    pass
                elif b2 == 'C':
                    n += 1
                    c += 1
                elif b2 == 'G':
                    n += 1
                    g += 1
                    # the entering G completes a trailing CG
                    if gseq[i + width - 1] == 'C':
                        cg += 1
                elif b2 == 'A' or b2 == 'T':
                    n += 1
                else:
                    raise Exception('Unknown base: %s' % b2)

                # NOTE(review): the counts are written after the window
                # has advanced, i.e. they describe
                # gseq[i + 1:i + 1 + width] -- confirm that the reported
                # coordinate is the intended one.
                if i % args.s == 0:
                    out.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\n" %
                              (chrm, i + args.k - 1, i + args.k, c, g, cg, n))
    finally:
        # Flush and release the output file; stdout is left open.
        if out is not sys.stdout:
            out.close()
Ejemplo n.º 6
0
def main_comp(args):
    """Print the length and the C, G and CG counts of one genomic region.

    ``args.g`` is a region string of the form ``chrm:beg-end``; the
    sequence is fetched upper-cased from the reference ``args.i``.

    Raises:
        ValueError: if ``args.g`` does not look like ``chrm:beg-end``.
    """
    import faidx
    import re

    m = re.match(r'([^:]*):(\d+)-(\d+)', args.g)
    if m is None:
        # Without this check the failure surfaced as an AttributeError on
        # ``m.group``.
        raise ValueError(
            'cannot parse region %r, expected chrm:beg-end' % (args.g,))
    chrm = m.group(1)
    beg = int(m.group(2))
    end = int(m.group(3))

    genome = faidx.RefGenome(args.i)
    seq = genome.fetch_sequence(chrm, beg, end, uppercase=True)
    print('n:%d' % (end - beg))
    print('C:%d' % seq.count('C'))
    print('G:%d' % seq.count('G'))
    print('CG:%d' % seq.count('CG'))
Ejemplo n.º 7
0
def load_te_and_seqs(
        rmskbed='/Users/wandingzhou/projects/pj-mm/2015-04-23-alu/rmsk.bed.gz',
        load_seq=False,
        tetype=None,
        tetype2=None,
        tetype3=None):
    """Load TE records from a gzipped bed file, optionally with sequence.

    Records on chromosomes whose name contains an underscore after the
    first character are dropped, as are records not matching any of the
    ``tetype`` / ``tetype2`` / ``tetype3`` filters that were given.  With
    ``load_seq`` the sequence is attached via ``_te_load_seqs``; an
    IndexError raised there is ignored and the record is still kept.

    Returns a dict keyed by ``(chrm, beg, end)``.
    """
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    filters = (('tetype', tetype), ('tetype2', tetype2), ('tetype3', tetype3))
    tes = {}
    wzcore.err_print_sig()
    for lineno, line in enumerate(wzcore.opengz(rmskbed)):
        # progress report
        if lineno % 100000 == 0:
            wzcore.err_print_m(' %d' % lineno)

        fields = line.strip().split('\t')
        te = TE()
        te.chrm = fields[0]
        if te.chrm.find('_') > 0:
            continue
        te.beg = int(fields[1])
        te.end = int(fields[2])
        te.rmskbed = rmskbed
        te.strand = fields[3]
        te.tetype = fields[4]
        te.tetype2 = fields[5]
        te.tetype3 = fields[6]

        # A filter left at None accepts everything.
        if any(wanted is not None and getattr(te, attr) != wanted
               for attr, wanted in filters):
            continue

        if load_seq:
            try:
                _te_load_seqs(refgenome, te)
            except IndexError:  # TE at chromosome boundaries, ignore
                pass

        tes[(te.chrm, te.beg, te.end)] = te

    wzcore.err_print_m('\n')
    wzcore.err_print('Loaded %d TEs' % len(tes))
    return tes
Ejemplo n.º 8
0
def main_icc(args):
    """ incomplete conversion

    Walks the primary alignments on chrM of ``args.bam`` and tallies the
    retained versus converted calls returned by ``count_read_retention``:
    C2C / C2T for reads whose ``ZS`` tag is ``++`` or ``+-``, and
    G2G / G2A for all other reads.  With ``args.v`` one row is printed per
    read that has retained calls.  Otherwise a single summary row
    ``retained, converted, fraction`` is printed, where the fraction is
    ``nan`` when nothing was counted.
    """

    ref = faidx.RefGenome(args.ref)
    refseq = ref.fetch_chrmseq('chrM')

    bam = pysam.Samfile(args.bam)
    n_inc = 0
    n_com = 0
    try:
        for x in bam.fetch(reference='chrM'):

            # only consider primary alignment
            if x.is_secondary:
                continue

            strand_tag = dict(x.tags)['ZS']

            (nG2A, nC2T, nG2G, nC2C) = count_read_retention(
                x, refseq, args.m)
            if strand_tag == '++' or strand_tag == '+-':
                retained, converted = nC2C, nC2T
            else:  # -+ or --
                retained, converted = nG2G, nG2A
            n_inc += retained
            n_com += converted

            # Per-read rows are only printed for the four known tags and
            # only when something was retained, so the denominator is > 0.
            if (args.v and retained > 0
                    and strand_tag in ('++', '+-', '-+', '--')):
                print('\t'.join(
                    map(str, [
                        retained, retained + converted,
                        float(retained) / (retained + converted), strand_tag,
                        nC2C, nC2T, nG2G, nG2A, x.qname, x.pos, x.tid, x.flag
                    ])))
    finally:
        bam.close()

    if not args.v:
        total = n_inc + n_com
        # No counted calls used to raise ZeroDivisionError here.
        fraction = float(n_inc) / total if total > 0 else float('nan')
        print('\t'.join(map(str, [n_inc, n_com, fraction])))
Ejemplo n.º 9
0
def te_load_seqs(tes):
    """Attach sequences to TEs and return those for which it succeeded.

    ``tes`` may be a dict (its values are used) or any iterable of TE
    objects.  Elements for which ``_te_load_seqs`` raises IndexError are
    left out of the result, which is keyed by ``(chrm, beg, end)``.
    """
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')

    candidates = tes.itervalues() if isinstance(tes, dict) else iter(tes)

    loaded = {}
    for te in candidates:
        try:
            _te_load_seqs(refgenome, te)
        except IndexError:  # TE at chromosome boundaries, ignore
            continue
        else:
            loaded[(te.chrm, te.beg, te.end)] = te

    return loaded
Ejemplo n.º 10
0
def main_getfasta(args):
    """Append the reference sequence to every interval line of ``args.i``.

    Each input line starts with chrm, beg and end (tab separated).  The
    sequence is fetched from the reference ``args.f`` with ``beg + 1`` as
    start, and the line is written to ``args.o`` (stdout when it is None)
    followed by a tab and the sequence.  The literal ``OVERFLOW`` is
    written when the fetch raises IndexError.
    """

    import faidx
    genome = faidx.RefGenome(args.f)
    out = open(args.o, 'w') if args.o is not None else sys.stdout
    try:
        for line in args.i:
            record = line.strip()
            fields = record.split('\t')
            chrm = fields[0]
            beg = int(fields[1])
            end = int(fields[2])

            try:
                seq = genome.fetch_sequence(chrm, beg + 1, end)
            except IndexError:
                seq = "OVERFLOW"

            out.write('%s\t%s\n' % (record, seq))
    finally:
        # Flush and release the output file; stdout is left open.
        if out is not sys.stdout:
            out.close()
Ejemplo n.º 11
0
def load_cgi_and_seqs():
    """Read the CpG island bed file and attach an upper-cased sequence.

    Returns the list of CGI objects in file order.  The sequence span of
    each one is whatever its own ``seq_beg()`` / ``seq_end()`` report.
    """
    bed_path = '/Users/wandingzhou/projects/hs-tcga/data/2015_03_24_cpg_island/TakaiJones/takai.jones.strict.bed.gz'

    cgis = []
    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    for record in wzcore.opengz(bed_path):
        fields = record.strip().split('\t')

        cgi = CGI()
        cgi.chrm = fields[0]
        cgi.beg = int(fields[1])
        cgi.end = int(fields[2])
        cgi.cgitype = fields[3]

        # The coordinates must be set before seq_beg()/seq_end() are asked.
        raw = refgenome.fetch_sequence(cgi.chrm, cgi.seq_beg(), cgi.seq_end())
        cgi.seq = raw.upper()

        cgis.append(cgi)

    wzcore.err_print('Loaded %d CGIs' % len(cgis))
    return cgis
Ejemplo n.º 12
0
def tss_load_cpgdensity(tss_table):
    """Count CG dinucleotides in 100-base windows around every TSS.

    The first three columns of each ``tss_table`` row are read as
    chromosome, TSS position and strand.  For each of the 5001 positions
    from -2500 to +2500 around the TSS, the CG count of the 100-base
    window around that position is recorded.  Rows whose strand is not
    '+' are stored in reverse order.

    Returns a DataFrame indexed like ``tss_table`` with the relative
    positions as columns.
    """
    import faidx
    import pandas as pd

    d_upstream = 2500
    d_downstream = 2500
    pad = 200   # extra bases fetched on either side of the profiled span
    half = 50   # half width of the counting window
    poses = range(-d_upstream, d_downstream + 1)

    refgenome = faidx.RefGenome('/Users/wandingzhou/references/hg19/hg19.fa')
    profiles = []
    for _, f in tss_table.iterrows():
        chrm = f[0]
        tss = f[1]
        strand = f[2]

        forward = strand == "+"
        if forward:
            d1, d2 = d_upstream, d_downstream
        else:
            d1, d2 = d_downstream, d_upstream

        genome_seq = refgenome.fetch_sequence(chrm, tss - d1 - pad,
                                              tss + d2 + pad).upper()
        counts = [
            genome_seq[pad + i - half:pad + i + half].count('CG')
            for i in xrange(d1 + d2 + 1)
        ]
        if not forward:
            counts.reverse()

        profiles.append(counts)

    return pd.DataFrame(profiles, index=tss_table.index, columns=poses)
Ejemplo n.º 13
0
def main_orphan(args):
    """Report CG dinucleotides at least ``args.l`` bases from both neighbours.

    For every sequence of the reference ``args.i`` the CG positions are
    visited left to right.  A CG is written to ``args.o`` (stdout when it
    is None) as ``chrm, pos, pos + 2, '+'`` when the distance to the
    previous CG and to the next CG are both at least ``args.l``.  The
    first and last CG of a sequence only need to satisfy the side on
    which they have a neighbour.
    """

    import faidx
    genome = faidx.RefGenome(args.i)
    out = open(args.o, 'w') if args.o is not None else sys.stdout
    try:
        for c in genome.faidx:
            if args.v:
                err_print(c)
            gseq = genome.fetch_chrmseq(c)
            prev = None               # position of the previous CG
            prev_is_good_left = True  # previous CG is far from its left CG

            # str.find visits every CG, including one in the final two
            # bases, which the former index loop stopped short of.
            i = gseq.find('CG')
            while i != -1:
                # ``prev`` is a position, so 0 is a valid value: test
                # against None rather than for truthiness.
                if prev is not None:
                    far_enough = i - prev >= args.l
                    if prev_is_good_left and far_enough:
                        tprint([c, prev, prev + 2, '+'], out)
                    prev_is_good_left = far_enough
                prev = i
                i = gseq.find('CG', i + 1)

            # The last CG has no right neighbour to check.
            if prev_is_good_left and prev is not None:
                tprint([c, prev, prev + 2, '+'], out)
    finally:
        # Flush and release the output file; stdout is left open.
        if out is not sys.stdout:
            out.close()
Ejemplo n.º 14
0
def main_printc(args):
    """Write every cytosine context (CG, CHG, CHH) of the reference.

    Each sequence of the reference ``args.i`` is scanned three bases at a
    time, and rows are written with ``tprint`` to ``args.o`` (stdout when
    it is None): ``chrm, beg, end, context, strand, triplet``.
    Minus-strand rows point at the G of the forward sequence and carry
    the reverse-complemented triplet.  Triplets with an N in a context
    position are skipped.
    """

    import faidx
    genome = faidx.RefGenome(args.i)
    out = open(args.o, 'w') if args.o is not None else sys.stdout
    try:
        for c in genome.faidx:
            if args.v:
                err_print(c)
            gseq = genome.fetch_chrmseq(c)
            # NOTE(review): the loop stops three bases before the end, so
            # a CG occupying the final two bases is not reported --
            # confirm this is acceptable.
            for i in range(len(gseq) - 2):
                b0, b1, b2 = gseq[i], gseq[i + 1], gseq[i + 2]

                # for CG (symmetric), print the position of CG (2-bases)
                if b0 == 'C' and b1 == 'G':
                    tprint([c, i, i + 2, 'CG', '+', 'CG'], out)

                # CHGs are asymmetric
                # for example, CCG is a CHG but its reverse complement CGG
                # is not a CHG
                if b0 == 'C' and b1 != 'G' and b2 == 'G':
                    if b1 != 'N':
                        tprint([c, i, i + 1, 'CHG', '+', gseq[i:i + 3]], out)

                if b0 == 'C' and b1 != 'C' and b2 == 'G':
                    if b1 != 'N':
                        tprint([
                            c, i + 2, i + 3, 'CHG', '-',
                            reverse_complement(gseq[i:i + 3])
                        ], out)

                # for CHH, print the position of C
                if b0 == 'C' and b1 != 'G' and b2 != 'G':
                    if b1 != 'N' and b2 != 'N':
                        tprint([c, i, i + 1, 'CHH', '+', gseq[i:i + 3]], out)
                if b0 != 'C' and b1 != 'C' and b2 == 'G':
                    if b0 != 'N' and b1 != 'N':
                        tprint([
                            c, i + 2, i + 3, 'CHH', '-',
                            reverse_complement(gseq[i:i + 3])
                        ], out)
    finally:
        # Flush and release the output file; stdout is left open.
        if out is not sys.stdout:
            out.close()
Ejemplo n.º 15
0
def main_one(args):
    """Print one SAM record aligned against its reference neighbourhood.

    ``args.sam`` is a single SAM record given as a string and ``args.ref``
    the reference to fetch from.  Five rows are printed after a header
    with the read name: position marker, reference, query, bisulfite
    row and base qualities.  Formatting is delegated to ``rprint``,
    ``qprint``, ``qualprint`` and ``bsprint``, which are defined elsewhere.

    Raises:
        Exception: on a CIGAR operation other than 0, 1, 2 or 4.
    """

    samrec = args.sam
    ref = faidx.RefGenome(args.ref)

    samfields = samrec.split()
    # print samfields
    chrm = samfields[2]
    # NOTE(review): this binds the Read class itself (it is not called), so
    # the attributes below are set on the class -- confirm this is intended.
    read = Read
    read.pos = int(samfields[3]) - 1  # the raw sequence is 1-based, shift back
    read.seq = samfields[9]
    read.cigar = parse_cigar(samfields[5])
    read.qname = samfields[0]
    read.flag = int(samfields[1])
    # SAM flag bits: 0x40 first in pair, 0x80 second in pair, 0x10 reverse
    read.is_read1 = read.flag & 0x40
    read.is_read2 = read.flag & 0x80
    read.is_reverse = read.flag & 0x10
    read.qual = samfields[10]
    # optional fields look like TAG:TYPE:VALUE; keep (TAG, VALUE) pairs
    read.tags3 = [_tagstr.split(':') for _tagstr in samfields[11:]]
    read.tags = [(_[0], _[2]) for _ in read.tags3]
    print read.tags

    # Fetch the reference with up to 100 extra bases on either side of the
    # read; the start is clamped to 1 near the beginning of the chromosome.
    refbeg = read.pos - 100 if read.pos > 100 else 1
    refend = read.pos + len(read.seq) + 100
    refseq = ref.fetch_sequence(chrm, refbeg, refend).upper()
    rpos = read.pos - refbeg + 1  # cursor into refseq at the read start
    qpos = 0                      # cursor into the read sequence

    # flank_len is a module-level name; the flank cannot be longer than
    # what was fetched to the left of the read.
    flen = flank_len
    if flen > rpos:
        flen = rpos

    # With a leading soft clip (op 4) the reference row starts further to
    # the left by the clip length.
    op, oplen = read.cigar[0]
    if op == 4:
        pr_beg = rpos - flen - oplen
    else:
        pr_beg = rpos - flen
    # pp: position row, pr: reference row, pq: query row,
    # pqual: quality row, pbis: bisulfite row
    pp = pr = pq = pqual = ''
    pp += " " * (rpos - pr_beg)
    pr += rprint(refseq[pr_beg:rpos])
    pq += ' ' * flen
    pqual += ' ' * flen
    pbis = ' ' * flen

    # Operation codes presumably follow the BAM numbering (0 match,
    # 1 insertion, 2 deletion, 4 soft clip) -- confirm against parse_cigar.
    for i, (op, clen) in enumerate(read.cigar):
        if op == 0:
            # label the first aligned base once, while pp is still blank
            # NOTE(review): ''.isspace() is False, so an empty pp (flen of
            # 0 without a leading soft clip) never gets the label.
            if pp.isspace():
                pp += "|{}:{}".format(chrm, rpos + refbeg - 1)

            pr += rprint(refseq[rpos:rpos + clen])
            pq += qprint(read, qpos, qpos + clen, refseq, rpos)
            pqual += qualprint(read, qpos, qpos + clen)
            pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
            rpos += clen
            qpos += clen
        elif op == 1:
            # consumes query only: the reference row is padded with '*'
            pr += GAPC + '*' * clen + ENDC
            pq += qprint(read, qpos, qpos + clen, refseq, rpos)
            pqual += qualprint(read, qpos, qpos + clen)
            pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
            qpos += clen
        elif op == 2:
            # consumes reference only: the query rows are padded with '*'
            pr += rprint(refseq[rpos:rpos + clen])
            pq += GAPC + '*' * clen + ENDC
            pqual += '*' * clen
            pbis += '*' * clen
            rpos += clen
        elif op == 4:
            # soft-clipped bases are shown underlined; the reference row
            # and its cursor are left untouched
            pq += qprint(read, qpos, qpos + clen, refseq, rpos, UNDERLINE)
            pqual += qualprint(read, qpos, qpos + clen)
            pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
            qpos += clen
        else:
            raise Exception("unknown cigar: %d" % op)
    # trailing reference flank
    pr += rprint(refseq[rpos:rpos + flen])

    print "\n" + "=" * 5 + read.qname + "=" * 5
    print pp  # position
    print pr  # reference
    print pq  # query sequence
    print pbis  # bisulfite mode
    print pqual  # quality
Ejemplo n.º 16
0
def main_bis(args):
    """Print the alignment of the reads named ``args.qname`` in a region.

    Reads of ``args.bam`` overlapping ``args.reg`` are scanned; those whose
    name equals ``args.qname`` are printed as a position marker row, a
    reference row, a query row and a quality row, plus a bisulfite row
    with ``args.pbis`` and the read itself with ``args.pread``.
    ``args.pair`` decides when to stop: '12' after both mates were seen,
    '1' / '2' after that mate was seen, '0' after the first printed read.

    Raises:
        Exception: on a CIGAR operation other than 0, 1, 2 or 4.
    """

    samfile = pysam.Samfile(args.bam)
    ref = faidx.RefGenome(args.ref)

    read1seen = read2seen = False
    for read in samfile.fetch(region=args.reg):
        if read.qname != args.qname:
            continue

        chrm = samfile.getrname(read.tid)
        # Fetch the reference with up to 100 extra bases on either side of
        # the read; the start is clamped to 1 near the chromosome start.
        refbeg = read.pos - 100 if read.pos > 100 else 1
        refend = read.pos + len(read.seq) + 100
        refseq = ref.fetch_sequence(chrm, refbeg, refend).upper()
        rpos = read.pos - refbeg + 1  # cursor into refseq at the read start
        qpos = 0                      # cursor into the read sequence

        # flank_len is a module-level name; the flank cannot be longer
        # than what was fetched to the left of the read.
        flen = flank_len
        if flen > rpos:
            flen = rpos

        # With a leading soft clip (op 4) the reference row starts further
        # to the left by the clip length.
        op, oplen = read.cigar[0]
        if op == 4:
            pr_beg = rpos - flen - oplen
        else:
            pr_beg = rpos - flen
        # pp: position row, pr: reference row, pq: query row,
        # pqual: quality row, pbis: bisulfite row (only with args.pbis)
        pp = pr = pq = pqual = ''
        pp += " " * (rpos - pr_beg)
        pr += rprint(refseq[pr_beg:rpos])
        pq += ' ' * flen
        pqual += ' ' * flen
        if args.pbis: pbis = ' ' * flen

        # pysam CIGAR codes: 0 match, 1 insertion, 2 deletion, 4 soft clip
        for i, (op, clen) in enumerate(read.cigar):
            if op == 0:
                # label the first aligned base once, while pp is blank
                # NOTE(review): ''.isspace() is False, so an empty pp (flen
                # of 0 without a leading soft clip) never gets the label.
                if pp.isspace():
                    pp += "|{}:{}".format(chrm, rpos + refbeg - 1)

                pr += rprint(refseq[rpos:rpos + clen])
                pq += qprint(read, qpos, qpos + clen, refseq, rpos)
                pqual += qualprint(read, qpos, qpos + clen)
                if args.pbis:
                    pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
                rpos += clen
                qpos += clen
            elif op == 1:
                # consumes query only: the reference row is padded with '*'
                pr += GAPC + '*' * clen + ENDC
                pq += qprint(read, qpos, qpos + clen, refseq, rpos)
                pqual += qualprint(read, qpos, qpos + clen)
                if args.pbis:
                    pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
                qpos += clen
            elif op == 2:
                # consumes reference only: query rows are padded with '*'
                pr += rprint(refseq[rpos:rpos + clen])
                pq += GAPC + '*' * clen + ENDC
                pqual += '*' * clen
                if args.pbis: pbis += '*' * clen
                rpos += clen
            elif op == 4:
                # soft-clipped bases are shown underlined; the reference
                # row and its cursor are left untouched
                pq += qprint(read, qpos, qpos + clen, refseq, rpos, UNDERLINE)
                pqual += qualprint(read, qpos, qpos + clen)
                if args.pbis:
                    pbis += bsprint(read, qpos, qpos + clen, refseq, rpos)
                qpos += clen
            else:
                raise Exception("unknown cigar: %d" % op)
        # trailing reference flank
        pr += rprint(refseq[rpos:rpos + flen])

        print "\n" + "=" * 5 + read.qname + "=" * 5
        print pp
        print pr
        print pq
        print pqual
        if args.pbis: print pbis
        if args.pread:
            print str(read)

        # Stop as soon as the mates requested through args.pair were shown.
        if read.is_read1: read1seen = True
        if read.is_read2: read2seen = True
        if args.pair == '12' and read1seen and read2seen:
            break
        elif args.pair == '1' and read1seen:
            break
        elif args.pair == '2' and read2seen:
            break
        elif args.pair == '0':
            break
Ejemplo n.º 17
0
    def __init__(self,
                 genome,
                 cytoband=None,
                 angle_beg=0,
                 angle_end=360,
                 angle_chrm_space=0.05,
                 inner_radius=98,
                 outer_radius=101,
                 angle_inc=0.01,
                 bezier_anchor=15.0,
                 track_height=10,
                 track_space=1,
                 fontname='Arial'):
        """Store the chromosome layout and the plot geometry.

        genome: a path (opened with ``faidx.RefGenome``), an object that
            ``getsortedchrms`` accepts, or a ready-made list of
            ``(name, length)`` pairs.
        cytoband: optional path to a headerless table with the columns
            chrm, beg, end, band, bandtype.
        angle_beg, angle_end: plotted angular span in degrees; stored in
            radians.
        angle_chrm_space: space between chromosomes.
        inner_radius, outer_radius: radii of the chromosome ring.
        angle_inc: when plotting an arc, this is the increment.
        bezier_anchor: the higher the number, the higher the peak.
        track_height, track_space: height of a track and the space
            between tracks.
        fontname: font name, stored as given.
        """
        self.full_circle = (angle_end - angle_beg == 360)

        if isinstance(genome, str):
            genome = faidx.RefGenome(genome)
        chrms = genome if isinstance(genome, list) else getsortedchrms(genome)

        # Only the 5-column layout is read.  A variant with a leading extra
        # column was once sketched for mm10 but is not enabled.
        if cytoband:
            table = pd.read_table(
                cytoband, header=None,
                names=['chrm', 'beg', 'end', 'band', 'bandtype'])
            self.cytoband_table = table[['chrm', 'beg', 'end', 'bandtype']]
        else:
            self.cytoband_table = None

        self.chrms = [entry[0] for entry in chrms]
        self.chrms_plot = self.chrms[:]  # plot all by default
        self.chrm2len = dict(chrms)
        self.chrm2angles = {}

        ## plot parameters
        to_radians = np.pi
        self.angle_beg = angle_beg / 180.0 * to_radians
        self.angle_end = angle_end / 180.0 * to_radians
        self.angle_chrm_space = angle_chrm_space
        self.angle_inc = angle_inc
        self.inner_radius = inner_radius
        self.outer_radius = outer_radius
        self.bezier_anchor = bezier_anchor
        self.track_height = track_height
        self.track_space = track_space
        self.fontname = fontname
        self.ax = None

        ## derived parameters
        self.connect_radius = inner_radius * 0.95
        # tracks stack inwards from the inner radius ...
        self.inner_track_outer_radius = inner_radius - track_space
        self.inner_track_inner_radius = (
            self.inner_track_outer_radius - track_height)
        # ... and outwards from the outer radius
        self.outer_track_inner_radius = outer_radius + track_space
        self.outer_track_outer_radius = (
            self.outer_track_inner_radius + track_height)

        self.tracks = []
Ejemplo n.º 18
0
def main_mutation(args):
    """ generate genome with mutation from a mutation list

    For every sequence of ``args.genome`` SNPs are sampled at rate
    ``args.m``, and insertions and deletions each at rate ``args.i / 2``;
    indel lengths are Poisson(1) + 1.  Every event is logged to
    ``<args.prefix>/mutations.tsv``.  The mutated sequence and the list
    mapping each of its bases back to the original coordinate (-1 for
    inserted bases) are dumped to ``<prefix>/<chrm>.chrmseq.pkl`` and
    ``<prefix>/<chrm>.coords.pkl``.
    """

    genome = faidx.RefGenome(args.genome)
    with open(args.prefix + '/mutations.tsv', 'w') as fmut:
        for chrm in genome.faidx.keys():  # process each chromosome
            print(chrm)
            chrmseq = list(genome.fetch_chrmseq(chrm))
            seqlen = len(chrmseq)
            original = range(seqlen)
            deletion = [False] * seqlen
            insertion = [''] * seqlen

            # sample SNP
            # Twice as many candidates as needed are drawn, so that
            # rejected positions (N bases) can be skipped.
            nmut = int(args.m * seqlen)
            print("sampling %d SNPs" % nmut)
            i = 0
            for coord in random.sample(original, 2 * nmut):
                if i >= nmut:
                    break
                source = chrmseq[coord]
                if source == 'N':
                    continue
                target = random.choice(
                    [_ for _ in ['A', 'C', 'G', 'T'] if _ != chrmseq[coord]])
                fmut.write('%s\t%d\t%d\t%s\t%s\tSNP\n' %
                           (chrm, coord, coord + 1, source, target))
                chrmseq[coord] = target
                i += 1

            # sample insertion
            nIns = int(args.i * seqlen / 2)
            print("sampling %d insertion" % nIns)
            lambdaIns = 1
            lenIns = numpy.random.poisson(lambdaIns, nIns) + 1
            i = 0
            for coord in random.sample(original, 2 * nIns):
                if i >= nIns:
                    break
                if insertion[coord] != '':  # already have insertion
                    continue
                source = chrmseq[coord]
                if source == 'N':
                    continue
                insertion[coord] = ''.join(
                    numpy.random.choice(['A', 'T', 'C', 'G'], lenIns[i]))
                fmut.write(
                    '%s\t%d\t%d\t%s\t%s\tINS\n' %
                    (chrm, coord, coord + 1, source,
                     source + insertion[coord]))
                i += 1

            # sample deletion
            nDel = int(args.i * seqlen / 2)
            print("sampling %d deletions" % nDel)
            lambdaDel = 1
            lenDel = numpy.random.poisson(lambdaDel, nDel) + 1
            i = 0
            for coord in random.sample(original, 2 * nDel):
                if i >= nDel:
                    break

                source = chrmseq[coord]
                if source == 'N':
                    continue

                dlen = int(lenDel[i])
                # The deleted bases are coord+1 .. coord+dlen; skip
                # candidates that would run past the end of the sequence.
                if coord + dlen >= seqlen:
                    continue

                # avoid mixing insertion with deletion: the anchor and
                # every base to be deleted must be free of earlier indels
                # (the last deleted base used to be left unchecked)
                if any(insertion[p] != '' or deletion[p]
                       for p in xrange(coord, coord + dlen + 1)):
                    continue

                delSeq = ''.join(chrmseq[coord + 1:coord + dlen + 1])
                for p in xrange(coord + 1, coord + dlen + 1):
                    deletion[p] = True
                fmut.write(
                    '%s\t%d\t%d\t%s\t%s\tDEL\n' %
                    (chrm, coord, coord + dlen + 1, source + delSeq, source))
                i += 1

            # mutated sequence: kept bases, each followed by its insertion
            chrmseq1 = ''.join([b + insertion[i]
                                for i, b in enumerate(chrmseq)
                                if not deletion[i]])
            # the original coordinates, -1 for inserted bases
            coords = []
            for i, b in enumerate(chrmseq):
                if not deletion[i]:
                    coords.append(i)
                    if insertion[i] != '':
                        coords.extend([-1] * len(insertion[i]))

            with open(args.prefix + '/' + chrm + '.coords.pkl', 'w') as fh:
                dump(coords, fh)

            # chromseq
            with open(args.prefix + '/' + chrm + '.chrmseq.pkl', 'w') as fh:
                dump(chrmseq1, fh)
Ejemplo n.º 19
0
import wzcore
import faidx

# Module-level handle on the hg19 reference, created when this module is
# imported; the path is hard-coded.
refgenomehg19 = faidx.RefGenome('/Users/zhouw3/references/hg19/hg19.fa')


class GeneticElement(object):
    def __init__(self, flank=1000, flank1=None, flank2=None):
        self.flank1 = flank  # upstream (gene order)
        self.flank2 = flank  # downstream (gene order)
        if flank1:
            self.flank1 = flank1
        if flank2:
            self.flank2 = flank2

    def __len__(self):
        return self.end - self.beg

    def __repr__(self):

        return '<Element: %s:%d-%d>' % (self.chrm, self.beg, self.end)

    def rep_coord(self):

        return '%s:%d-%d' % (self.chrm, self.beg, self.end)

    def seq_beg(self):
        """ note that beg, end, seq_beg(), seq_end() are all in reference coordinates """
        if self.strand == '+':
            return self.beg - self.flank1
        else: