예제 #1
0
파일: find_msp_sites.py 프로젝트: cwng/RRBS
def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    """Write the 40bp ends of size-selected 'ccgg'-to-'ccgg' fragments as Fasta.

    The Fasta file `fafile` is scanned for every occurrence of the lowercase
    4-mer 'ccgg' (presumably the MspI recognition site -- confirm).  Each pair
    of consecutive sites whose distance d satisfies 40 < d < 250 defines a
    fragment.  For every fragment two 40bp loci are fetched through NibDB:
    one on '+' starting one base after the first site position, and one on '-'
    ending three bases after the second site position.

    Parameters:
        fafile:  path of the input Fasta file.  The running coordinate is not
                 reset at '>' headers and only the last header name is used,
                 so the file is expected to hold a single sequence.
        seqfile: path of the Fasta file to write.
        genome:  'hg18' selects a fixed nib directory; any other value uses
                 the 'mm9' organism settings of chipsequtil.
        convert: if true, every 'c' of the fetched sequences is replaced by
                 't' before writing.
        bedFrag: if true, the fragments are also saved as tab-separated
                 chrom/start/end rows to `seqfile` with '.fa' replaced by
                 '_frag.bed'.

    Returns:
        None.  The output is written to `seqfile` (and optionally the bed
        file); the number of sites and of fragments is printed.

    Raises:
        ValueError: if fragments were found but the file had no '>' header.
    """
    # Every line is scanned with the last 3 characters of the previous line
    # prepended, so that sites spanning a line break are seen.  `offset` is the
    # 0-based sequence coordinate of seq[0], hence the initial -3.
    offset=-3
    hang='NNN'
    # Lines made of exactly 50 N's cannot contain a site; skip them quickly.
    n_line='N'*50
    ch=None
    match=[]

    #find ccgg positions using Fasta file
    with open(fafile) as fa:
        for line in fa:
            l=line.rstrip('\r\n')
            if not l:
                # blank lines carry no bases (and used to crash on l[0])
                continue
            if l[0]=='>':
                ch=l[1:]
                continue
            if l==n_line:
                offset+=len(l)
                hang=l[-3:]
                continue
            seq=hang+l
            # len(seq)-3 windows: one 4-mer ending on each base of this line,
            # including the one ending on the last base of the line.
            # NOTE(review): only lowercase 'ccgg' is matched -- confirm the
            # input is lowercase (or that uppercase sites must be ignored).
            for i in range(len(seq)-3):
                if seq[i:i+4]=='ccgg': match.append(offset+i)
            hang=seq[-3:]
            offset+=len(l)

    print(len(match))

    #keep consecutive site pairs whose distance d satisfies 40 < d < 250
    FRAG=[(x,y) for x,y in zip(match[:-1],match[1:]) if 40<y-x<250]

    print(len(FRAG))

    if FRAG and ch is None:
        raise ValueError('no ">" header line found in %s'%fafile)

    #nibDB the fragment ends, 40bp each, and
    #save each as a pair of Fasta items with keys chr:position(strand)
    seq_dict={}
    ids,loci=[],[]
    BF=[]
    for x,y in FRAG:
        if bedFrag: BF.append([ch,str(x+1),str(y+3)])

        #40bp on '+' starting one base after the first site position
        plus_start=x+1
        ids.append(ch+':'+str(plus_start)+'+')
        loci.append((ch,plus_start,x+41,'+'))

        #40bp on '-' ending three bases after the second site position;
        #the key carries the end coordinate
        minus_stop=y+3
        ids.append(ch+':'+str(minus_stop)+'-')
        loci.append((ch,y-37,minus_stop,'-'))

    if bedFrag: np.savetxt(seqfile.replace('.fa','_frag.bed'),BF,fmt='%s',delimiter='\t')

    # NOTE(review): any genome other than 'hg18' falls back to the 'mm9'
    # settings, whatever `genome` says -- confirm this is intended.
    if genome=='hg18':  DB=NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:  DB=NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])
    fa_ids,seqs=DB.get_fasta_batch(loci)

    # Sequences of both strands are stored exactly as returned by NibDB (no
    # reversal is applied here); only the optional c->t conversion is done.
    for key,seq in zip(ids,seqs):
        seq_dict[key]=seq.replace('c','t') if convert else seq
    Fasta.write(seq_dict,seqfile)
예제 #2
0
    # Fragment: fetch a window around each gene's txStart and write the
    # sequences as Fasta, with the matching gene names appended to each header.

    # Optional whitelist of gene identifiers, one per line of the file named
    # by opts.gene_list.  The file handle is not explicitly closed here.
    # presumably gene_list is initialised to something falsy before this
    # fragment when the option is absent -- verify against the caller.
    if opts.gene_list :
        gene_list = [x.strip() for x in open(opts.gene_list).readlines()]

    # Record field compared against gene_list: 'bin' unless opts.gene_type is
    # a non-default choice equal to 'refgene', in which case 'name' is used.
    id_index = 'bin'
    if opts.gene_type != gene_type_choices[0] :
        if opts.gene_type  == 'refgene' :
            id_index = 'name'

    # seq_recs: (chrom, start, end, strand) tuples to fetch.
    # gene_map: (chrom, start, end) -> list of 'bin/name' labels, so that
    # several genes sharing the same window are all reported.
    seq_recs = []
    gene_map = defaultdict(list)
    for rec in refgene_f :
        if gene_list and rec[id_index] not in gene_list : continue # skip this one
        # Window is [txStart-upstream, txStart+downstream], clamped to 0 and
        # to the chromosome length reported by nib_db.
        # NOTE(review): txStart is used for both strands -- confirm this is
        # the intended anchor for '-' strand genes.
        st, end = max(0,int(rec['txStart'])-opts.upstream), min(int(rec['txStart'])+opts.downstream,nib_db.db_info[rec['chrom']]['nbases'])
        key = (rec['chrom'],st,end,rec['strand'])
        seq_recs.append(key)
        # The strand is dropped from the map key; the lookup below is done
        # with the three values parsed from the header.
        gene_map[key[:-1]].append(rec['bin']+'/'+rec['name'])

    # Unpacked below as parallel (headers, sequences) via zip(*fasta_recs).
    fasta_recs = nib_db.get_fasta_batch(seq_recs)

    # Write to the requested file, or to stdout when no output is given.
    # The file is not closed within this fragment.
    out_f = open(opts.output,'w') if opts.output else sys.stdout
    # Captures the chromosome name ('chr' followed by digits/M/X/Y) and the
    # start and end coordinates from a header of the form ...chrN...:start-end...
    header_regex = re.compile('^.*(chr[0-9MXY]+).*:([0-9]+)-([0-9]+).*$')
    for header, seq in zip(*fasta_recs) :
        # map sequences back to gene names using the header
        reg_obj = header_regex.search(header)
        if reg_obj is not None :
            chrm,st,end = reg_obj.groups()
            # .get() is used so that a miss does not insert an empty list
            # into the defaultdict.
            gene_names = gene_map.get((chrm,int(st),int(end)))
            if gene_names is not None :
                header = header.strip()+':'+','.join(gene_names)+'\n'
        # Headers that did not match, or had no genes, are written unchanged.
        out_f.write(header+seq+'\n')
예제 #3
0
            # Fragment: score a random sample of the peaks with indices in
            # [st_i, end_i).  The sample size is sample_percent of the range
            # length, truncated to an int; random.sample draws without
            # replacement.
            num_to_sample = int(sample_percent*(end_i-st_i))
            inds_to_sample = random.sample(xrange(st_i,end_i),num_to_sample)

            # we memoize the sequences we've seen before so we don't fetch seqs
            # unnecessarily
            unmemoed_inds_to_sample = set(inds_to_sample).difference(set(pval_bin_memo.keys()))

            # Build one (chr, start, end, '+') request per peak that is not
            # in the memo yet.
            bin_fasta_batch = []
            for peak_i in unmemoed_inds_to_sample :
                bin_fasta_batch.append((str(all_peaks[peak_i]['chr']),
                                        int(all_peaks[peak_i]['start']),
                                        int(all_peaks[peak_i]['end']),
                                        '+'))

            if len(bin_fasta_batch) != 0 :
                bin_headers, bin_seq = nibDb.get_fasta_batch(bin_fasta_batch)

                # The set has not been modified since the loop above, so it
                # iterates in the same order and bin_seq[i] lines up with ind.
                # assumes get_fasta_batch returns sequences in request order
                # -- TODO confirm.  Sequences are stored uppercased.
                for i, ind in enumerate(unmemoed_inds_to_sample) :
                    pval_bin_memo[ind] = bin_seq[i].upper()

            # score the sequences
            # A new list is started for this bin and filled with one
            # normalised score per sampled peak.
            pval_bin_pvals.append([])
            for ind in inds_to_sample :
                max_score = m.bestscan(pval_bin_memo[ind])
                # Linear rescale so that m.minscore maps to 0 and m.maxscore
                # maps to 1.
                # NOTE(review): under Python 2 this is integer division if
                # all operands are ints -- confirm the scores are floats.
                max_score = (max_score-m.minscore)/(m.maxscore-m.minscore)
                pval_bin_pvals[-1].append(max_score)
            # Replace the list of scores for this bin by a numpy array.
            pval_bin_pvals[-1] = np.array(pval_bin_pvals[-1])


        # Start a 4x4 figure (mp is presumably a matplotlib plotting module
        # alias -- verify against the imports) and set up a font dict.
        mp.figure(figsize=(4,4))
        font = {'size':'9'}