Ejemplo n.º 1
0
def makeseq(chromlen, binlength, species):
    """Tile every chromosome into fixed-size bins and fetch their sequences.

    chromlen  -- path to a whitespace-delimited file; column 1 is the
                 chromosome name and column 2 its length.
    binlength -- bin size in bases (anything accepted by int()).
    species   -- genome name handed to cis.interval().

    For each chromosome, bins start at position 1 and advance by
    ``binlength``.  Every full bin is widened by 100 bases on both sides;
    the start is clamped at 1, the end is NOT clamped to the chromosome
    length.  The last bin of a chromosome runs from its unpadded start to
    the chromosome length.

    Returns the cis.interval object after filling its chrom/start/end lists
    and calling getSequence().

    Raises ValueError if binlength is not positive (the tiling loop could
    otherwise never terminate).
    """
    binlength = int(binlength)
    if binlength <= 0:
        raise ValueError("binlength must be a positive integer")

    x = cis.interval(genome=species)
    x.chrom = []
    x.start = []
    x.end = []

    # The context manager closes the file even if a line fails to parse.
    with open(chromlen) as inf:
        for line in inf:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            chrom = fields[0]
            chrom_size = int(fields[1])

            start = 1
            end = binlength
            while end < chrom_size:
                x.chrom.append(chrom)
                x.start.append(max(1, start - 100))
                x.end.append(end + 100)
                start += binlength
                end += binlength

            # Remainder bin: no padding, ends exactly at the chromosome end.
            x.chrom.append(chrom)
            x.start.append(start)
            x.end.append(chrom_size)

    x.getSequence()
    return x
Ejemplo n.º 2
0
def makeseq(bedfile, species):
    """Load intervals from a BED-like file and fetch their sequences.

    bedfile -- path to a whitespace-delimited file whose first three columns
               are chromosome, start and end.
    species -- genome name handed to cis.interval().

    Coordinates are stored exactly as they appear in the file (converted to
    int); no padding or coordinate shift is applied here.

    Returns the cis.interval object after filling its chrom/start/end lists
    and calling getSequence().
    """
    x = cis.interval(genome=species)
    x.chrom = []
    x.start = []
    x.end = []

    # The context manager closes the file even if a line fails to parse.
    with open(bedfile) as inf:
        for line in inf:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            x.chrom.append(fields[0])
            x.start.append(int(fields[1]))
            x.end.append(int(fields[2]))

    x.getSequence()
    return x
Ejemplo n.º 3
0
def getsignal(inputfile,
              outputfile,
              BGmatrix,
              pcut,
              ncut,
              pspan,
              fetch_length=100,
              gen='hg19'):
    """Write per-site cut signal and hexamer background values to a table.

    inputfile    -- whitespace-delimited file; columns 1-3 are chromosome,
                    start, end and column 6 is the strand.  It must contain
                    at least one record: the first line defines the site
                    length.
    outputfile   -- path of the tab-separated table to write.
    BGmatrix     -- passed to readBG(), which must return two mappings
                    (pBG, nBG) keyed by upper-case 6-mers.
    pcut, ncut   -- paths of the two bigWig files passed to make_cut()
                    (presumably plus/minus strand cut tracks -- confirm).
    pspan        -- span around the site; it is reduced by half the site
                    length (taken from the first input line) before use.
    fetch_length -- forwarded unchanged to make_cut().
    gen          -- genome name handed to c.interval().

    Each output row is: chrom, start, end, strand, the two make_cut()
    results, then the two background lists.  For '-' strand sites the two
    cut results are swapped and the background lists are swapped and
    reversed.  Sites are skipped when make_cut() yields 'NA' or when the
    fetched sequence contains an 'N'.
    """
    # Keep the bigWig handles open only for the duration of the call.
    with open(pcut, 'rb') as pcut_handle, open(ncut, 'rb') as ncut_handle:
        pcutbw = BigWigFile(pcut_handle)
        ncutbw = BigWigFile(ncut_handle)

        X = c.interval(genome=gen)
        X.chrom, X.start, X.end, X.val = [], [], [], []

        with open(inputfile) as inf:
            first_record = inf.readline().split()
            site_length = int(first_record[2]) - int(first_record[1])
            # Floor division keeps pspan integral on Python 2 and 3 alike.
            pspan = pspan - site_length // 2
            inf.seek(0)

            pBG, nBG = readBG(BGmatrix)
            for line in inf:
                ll = line.split()
                if not ll:
                    # Tolerate blank lines.
                    continue
                X.chrom.append(ll[0])
                # Pad by pspan + 3 on each side; the +1 shifts both
                # coordinates by one base.
                X.start.append(int(ll[1]) - pspan - 3 + 1)
                X.end.append(int(ll[2]) + pspan + 3 + 1)
                X.val.append(ll[5])

        X.getSequence()

        with open(outputfile, 'w') as outf:
            for i, seq in enumerate(X.seq):
                pchrm = X.chrom[i]
                # Undo the padding applied above: this recovers the start
                # and end read from the input file.
                pstart = X.start[i] - 1 + 3 + pspan
                pend = X.end[i] - 1 - 3 - pspan
                strand = X.val[i]

                pll = [pchrm, pstart, pend, strand]
                pout = make_cut(pcutbw, pll, pspan, fetch_length)
                nout = make_cut(ncutbw, pll, pspan, fetch_length)
                if strand == "-":
                    pout, nout = nout, pout
                # Both results are concatenated into the row below, so a
                # 'NA' marker in either of them makes the site unusable.
                if pout == 'NA' or nout == 'NA':
                    continue

                upper_seq = seq.upper()
                if 'N' in upper_seq:
                    continue

                # The two 6-mer windows are offset by one base from each
                # other.
                pseq = upper_seq[:-1]
                nseq = upper_seq[1:]
                window_count = len(pseq) + 1 - 6
                p = [pBG[pseq[k:k + 6]] for k in range(window_count)]
                n = [nBG[nseq[k:k + 6]] for k in range(window_count)]

                if strand != '-':
                    pbglist = p
                    nbglist = n
                else:
                    pbglist = n[::-1]
                    nbglist = p[::-1]

                newll = ([pchrm, pstart, pend, strand] + pout + nout +
                         pbglist + nbglist)
                outf.write("\t".join(map(str, newll)) + "\n")
Ejemplo n.º 4
0
def profile(inputfile, outputfile, pattern, strand):
    """Score each site with a dinucleotide table and write one row per site.

    inputfile  -- whitespace-delimited file; column 1 is the chromosome and
                  column 2 the site position.
    outputfile -- path of the tab-separated score table to write.
    pattern    -- 4x4 table indexed as pattern[first][second] with indices
                  0-3.  On '+' the indices are A, C, G, T = 0, 1, 2, 3; on
                  '-' they are the complemented ones (A, C, G, T =
                  3, 2, 1, 0).
    strand     -- '+' or '-'.  Any other value produces empty score lists
                  and therefore an empty output file.

    A window from position-30 to position+33 is fetched from hg19 for every
    site.  Scoring of a site stops at the first base that is not A/C/G/T,
    and only sites that yield exactly 61 scores are written.
    """
    forward_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    complement_index = {"A": 3, "C": 2, "G": 1, "T": 0}

    x = cis.interval(genome='hg19')
    x.chrom = []
    x.start = []
    x.end = []
    with open(inputfile) as inf:
        for line in inf:
            ll = line.split()
            if not ll:
                # Tolerate blank lines.
                continue
            x.chrom.append(ll[0])
            x.start.append(int(ll[1]) - 30)
            x.end.append(int(ll[1]) + 2 + 31)
    x.getSequence()

    scores = []
    for i in range(len(x.start)):
        score = []
        s = x.seq[i].upper()
        if strand == "+":
            for j in range(len(s) - 2):
                p1 = forward_index.get(s[j])
                p2 = forward_index.get(s[j + 1])
                if p1 is None or p2 is None:
                    # Ambiguous base: abandon the rest of this site.
                    break
                score.append(pattern[p1][p2])
        elif strand == "-":
            # Read the dinucleotide on the opposite strand: the downstream
            # base comes first and both bases are complemented.
            for j in range(1, len(s) - 1):
                p1 = complement_index.get(s[j + 1])
                p2 = complement_index.get(s[j])
                if p1 is None or p2 is None:
                    break
                score.append(pattern[p1][p2])
        scores.append(score)

    with open(outputfile, 'w') as outf:
        for score in scores:
            # Incomplete profiles (truncated by an ambiguous base or a
            # short sequence) are dropped.
            if len(score) != 61:
                continue
            outf.write("\t".join(map(str, score)) + "\n")
Ejemplo n.º 5
0
def profile(inputfile, outputfile, mode, strand):
    """Count dinucleotides over a set of intervals and write a 4x4 table.

    inputfile  -- whitespace-delimited file; columns 1-3 are chromosome,
                  start and end.
    outputfile -- path of the tab-separated count table to write.
    mode       -- 'peak': use [start, end + 2] of every record.
                  'cut':  use a 2-base window starting at the record's
                          start when strand is '+', otherwise at its end.
    strand     -- '+' or '-'.

    Output has four rows (first base A, C, G, T) and four columns (following
    base A, C, G, T).  For '-' the fetched sequence is reversed and both
    bases are complemented before counting.  Dinucleotides containing
    anything other than A/C/G/T are ignored.

    An unknown mode or strand prints a message and terminates the program
    via SystemExit; as before, these checks only fire once a record (mode)
    or a fetched sequence (strand) is actually processed.
    """
    import sys  # local import: only needed for the error exits below

    column = {"A": 0, "C": 1, "G": 2, "T": 3}
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
    # counts[first_base][column[following_base]]
    counts = {"A": [0] * 4, "C": [0] * 4, "G": [0] * 4, "T": [0] * 4}

    x = cis.interval(genome='hg19')
    x.chrom = []
    x.start = []
    x.end = []
    with open(inputfile) as inf:
        for line in inf:
            ll = line.split()
            if not ll:
                # Tolerate blank lines.
                continue
            x.chrom.append(ll[0])
            if mode == "peak":
                x.start.append(int(ll[1]))
                x.end.append(int(ll[2]) + 2)
            elif mode == "cut":
                anchor = int(ll[1]) if strand == "+" else int(ll[2])
                x.start.append(anchor)
                x.end.append(anchor + 2)
            else:
                print("mode wrong, only peak,cut availabe")
                sys.exit()
    x.getSequence()

    for i in range(len(x.start)):
        s = x.seq[i].upper()
        if strand == "+":
            for j in range(len(s) - 1):
                first, second = s[j], s[j + 1]
                if first in column and second in column:
                    counts[first][column[second]] += 1
        elif strand == "-":
            s = s[::-1]
            for j in range(len(s) - 1):
                first, second = s[j], s[j + 1]
                if first in complement and second in complement:
                    row = complement[first]
                    counts[row][column[complement[second]]] += 1
        else:
            print("strand only + and - ")
            sys.exit()

    with open(outputfile, 'w') as outf:
        for base in "ACGT":
            outf.write("\t".join(map(str, counts[base])) + "\n")
Ejemplo n.º 6
0
def read_file(fp, mintags=0, maxtags=10, maxlines=100, select=100):
    """Parse motif-site records with per-base cut counts.

    fp       -- iterable of text lines (e.g. an open file).  Only lines
                starting with 'chr' are parsed.  Reading stops once
                ``maxlines`` records have been accepted, so the rest of
                ``fp`` is left unread.
    mintags  -- inclusive lower bound on the summed tag count
                (columns 7 + 8).
    maxtags  -- exclusive upper bound on the summed tag count.
    maxlines -- number of accepted records after which reading stops.
    select   -- number of positions to keep, centred in the cut window.

    Column layout (0-based): 0 chrom, 1 start, 2 end, 3 name, 4 score,
    5 strand, 6/7 tag counts, 14-413 the first cut vector, 414 onward the
    second cut vector.  An alternative layout without a strand column was
    sketched in an unreachable branch of the original and is not supported.

    Only '+' strand records are kept.

    Returns (X, cuts_pos, cuts_neg): the c.interval object (hg19) after
    getSequence(), and two numpy arrays built from the selected slices of
    the two cut vectors.
    """
    # TODO read in more general format
    CHR, START, NAME, SCORE, STRAND, COUNT50P, COUNT50M = 0, 1, 3, 4, 5, 6, 7
    CUT_START = 14
    CUT_END = 414

    accepted = 0
    X = c.interval(genome='hg19')
    X.chrom, X.start, X.end, X.strand, X.name, X.val = [], [], [], [], [], []
    cuts_pos = []
    cuts_neg = []

    # Floor division keeps coordinates integral on Python 2 and 3 alike.
    half_window = (CUT_END - CUT_START) // 2
    mid = int(0.5 * (CUT_END - CUT_START))
    startoffset = mid - int(0.5 * select)
    endoffset = startoffset + select

    # Iterate lazily: the loop normally stops after ``maxlines`` records, so
    # there is no need to load the whole file into memory first.
    for elem in fp:
        if elem.startswith('chr'):
            f = elem.split()
            start = int(f[START])
            motifscore = float(f[SCORE])
            tagcount = float(f[COUNT50P]) + float(f[COUNT50M])
            strand = f[STRAND]

            ## Shawn : only + motif included
            if strand == '+' and mintags <= tagcount < maxtags:
                X.chrom.append(f[CHR])
                X.start.append(start - half_window + startoffset)
                X.end.append(start - half_window + endoffset)
                X.strand.append(strand)
                X.name.append(f[NAME])
                X.val.append((motifscore, tagcount))
                accepted += 1

                poscut = [float(z) for z in f[CUT_START:CUT_END]]
                # NOTE(review): this slice is 2 columns longer than the
                # first one (an older variant noted by "Shawn" ended at
                # 2*CUT_END-CUT_START) -- confirm against the file format.
                negcut = [
                    float(z) for z in f[CUT_END:2 * CUT_END + 2 - CUT_START]
                ]

                cuts_pos.append(poscut[startoffset:endoffset])
                cuts_neg.append(negcut[startoffset:endoffset])

        # Checked after every line, accepted or not, as in the original.
        if accepted == maxlines:
            break

    X.getSequence()

    return X, numpy.array(cuts_pos), numpy.array(cuts_neg)
Ejemplo n.º 7
0
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,pspan,fetch_length=100,gen='hg19'):
    """Plot the mean cut signal and mean hexamer background over all sites.

    inputfile    -- whitespace-delimited file; columns 1-3 are chromosome,
                    start, end and column 6 is the strand.
    outputfile   -- forwarded to plot_template().
    BGmatrix     -- passed to readBG(), which must return two mappings
                    (pBG, nBG) keyed by upper-case 6-mers.
    pcut, ncut   -- paths of the two bigWig files passed to make_cut()
                    (presumably plus/minus strand cut tracks -- confirm).
    pspan        -- span around each site; used for padding and forwarded
                    to make_cut().
    fetch_length -- forwarded unchanged to make_cut().
    gen          -- genome name handed to c.interval().

    Records on chromosomes absent from the ``pcut`` bigWig, and records for
    which make_cut() yields 'NA', are skipped.  For '-' strand sites the two
    cut results are swapped and the background lists are swapped and
    reversed.  Sequences containing 'N' contribute no background profile.
    """
    # Chromosomes known to the pcut bigWig; used only as a membership filter.
    bw_index = BwIO(pcut)
    chrom_len = {}
    for node in bw_index.chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    pp = []
    pm = []
    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    pBG, nBG = readBG(BGmatrix)

    # All three handles are closed as soon as the signal has been read.
    with open(pcut, 'rb') as pcut_handle, \
            open(ncut, 'rb') as ncut_handle, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_handle)
        ncutbw = BigWigFile(ncut_handle)
        for line in inf:
            ll = line.split()
            if not ll or ll[0] not in chrom_len:
                continue
            pout = make_cut(pcutbw, ll, pspan, fetch_length)
            nout = make_cut(ncutbw, ll, pspan, fetch_length)
            if ll[5] == "-":
                pout, nout = nout, pout
            # Both results are averaged below, so a 'NA' marker in either
            # of them makes the site unusable.
            if pout == 'NA' or nout == 'NA':
                continue
            pp.append(pout)
            pm.append(nout)
            X.chrom.append(ll[0])
            # Pad by pspan + 3 on each side; the +1 shifts both coordinates
            # by one base.
            X.start.append(int(ll[1]) - pspan - 3 + 1)
            X.end.append(int(ll[2]) + pspan + 3 + 1)
            X.val.append(ll[5])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)

    X.getSequence()

    pbglist = []
    nbglist = []
    for i, seq in enumerate(X.seq):
        strand = X.val[i]
        upper_seq = seq.upper()
        if 'N' in upper_seq:
            continue
        # The two 6-mer windows are offset by one base from each other.
        pseq = upper_seq[:-1]
        nseq = upper_seq[1:]
        window_count = len(pseq) + 1 - 6
        fwd = [pBG[pseq[k:k + 6]] for k in range(window_count)]
        rev = [nBG[nseq[k:k + 6]] for k in range(window_count)]
        if strand != '-':
            pbglist.append(fwd)
            nbglist.append(rev)
        else:
            pbglist.append(rev[::-1])
            nbglist.append(fwd[::-1])

    meanpbglist = apply_mean(pbglist)
    meanmbglist = apply_mean(nbglist)

    plot_template(meanp, meanm, meanpbglist, meanmbglist, outputfile)
Ejemplo n.º 8
0
def make_template(data, flank, pflank, topmotif, out, pbw, mbw, bgmatrix, gen):
    """Build and plot the mean cut profile and mean hexamer background.

    data     -- whitespace-delimited motif file; columns 1-3 are chromosome,
                start, end, column 5 a numeric score and column 6 the
                strand.  The first record defines the motif length.
    flank    -- half-width of the region summarised around each site start.
    pflank   -- half-width of the window kept around the motif centre.
    topmotif -- currently unused (the top-N cut-off is disabled).
    out      -- forwarded to plot_template().
    pbw, mbw -- paths of the two bigWig files (presumably plus/minus strand
                cut tracks -- confirm).
    bgmatrix -- path to a whitespace-delimited file: 6-mer, then two floats
                stored in the pBG and nBG tables respectively.
    gen      -- genome name handed to c.interval().

    For '-' strand sites both summaries are reversed and swapped before
    windowing.  The sequence background uses '+' strand sites only and
    skips sequences containing 'N'/'n'.
    """
    pp = []
    pm = []

    # Keep the bigWig handles open only while the cut signal is collected.
    with open(pbw, 'rb') as plus_handle, open(mbw, 'rb') as minus_handle:
        w_plus_H = BigWigFile(plus_handle)
        w_minus_H = BigWigFile(minus_handle)

        with open(data) as inf:
            # Blank lines are dropped; they carry no record.
            templatelist = [ll for ll in (line.split() for line in inf) if ll]

        # Motif length comes from the first record, before sorting.
        ml = int(templatelist[0][2]) - int(templatelist[0][1])
        templatelist.sort(key=lambda x: float(x[4]), reverse=True)

        # Floor division keeps the slice bounds integral on Python 2 and 3.
        centre = flank + 1 + ml // 2
        plus_lo, plus_hi = centre - pflank, centre + pflank
        # On the reversed vector the centre moves by (ml + 1) positions.
        minus_lo = centre - 1 - ml - pflank
        minus_hi = centre - 1 - ml + pflank

        ### for cut sitepro
        for ll in templatelist:
            site = int(ll[1])
            p_sum = list(
                w_plus_H.summarize(ll[0], site - flank, site + flank,
                                   2 * flank).sum_data)
            m_sum = list(
                w_minus_H.summarize(ll[0], site - flank, site + flank,
                                    2 * flank).sum_data)
            if ll[5] == "+":
                pp.append(p_sum[plus_lo:plus_hi])
                pm.append(m_sum[plus_lo:plus_hi])
            elif ll[5] == '-':
                pm.append(p_sum[::-1][minus_lo:minus_hi])
                pp.append(m_sum[::-1][minus_lo:minus_hi])

    # NOTE(review): these look like leftover debug dumps; kept so stdout
    # output is unchanged.
    print(pp)
    print(pm)
    meanp = apply_mean(pp)
    meanm = apply_mean(pm)
    P = [meanp[i] for i in range(len(meanp))]
    M = [meanm[i] for i in range(len(meanp))]

    ### for seqbias bg
    pBG = {}
    nBG = {}
    with open(bgmatrix) as inf:
        for line in inf:
            ll = line.split()
            if not ll:
                continue
            pBG[ll[0]] = float(ll[1])
            nBG[ll[0]] = float(ll[2])

    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    for ll in templatelist:
        X.chrom.append(ll[0])
        X.start.append(int(ll[1]) + 1 - flank)
        X.end.append(int(ll[1]) + 1 + flank)
        X.val.append(ll[5])
    X.getSequence()

    pbglist = []
    nbglist = []
    for i, seq in enumerate(X.seq):
        strand = X.val[i]
        if strand != '+' or 'N' in seq or 'n' in seq:
            continue
        # The two windows are offset by one base from each other.
        pseq = seq[(centre - pflank - 3):(centre + pflank + 2)]
        nseq = seq[(centre - pflank - 2):(centre + pflank + 3)]
        window_count = len(pseq) + 1 - 6
        pbglist.append(
            [pBG[pseq[k:k + 6].upper()] for k in range(window_count)])
        nbglist.append(
            [nBG[nseq[k:k + 6].upper()] for k in range(window_count)])

    meanpbglist = apply_mean(pbglist)
    meanmbglist = apply_mean(nbglist)
    Plusbg = [meanpbglist[i] for i in range(len(meanpbglist))]
    Minusbg = [meanmbglist[i] for i in range(len(meanpbglist))]

    plot_template(P, M, Plusbg, Minusbg, out)