def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    """Write the 40bp end sequences of CCGG-bounded fragments to a FASTA file.

    The FASTA file ``fafile`` is scanned for lower-case ``ccgg`` sites
    (presumably MspI cut sites, going by the function name -- TODO confirm).
    Consecutive sites whose spacing ``d`` satisfies ``40 < d < 250`` define a
    fragment.  For every fragment two 40bp windows are fetched through NibDB,
    one at each end, and written to ``seqfile`` with ``Fasta.write``.

    Parameters
    ----------
    fafile : str
        Path of the FASTA file to scan.  Positions keep accumulating and the
        chromosome name is overwritten on every ``>`` header, so the file is
        effectively treated as a single record.
    seqfile : str
        Path of the FASTA file to write.
    genome : str
        ``'hg18'`` selects the hard-coded human nib directory; any other value
        uses the ``'mm9'`` genome_dir from ``chipsequtil.get_org_settings``.
    convert : bool
        When true, every lower-case ``c`` in the fetched sequences is replaced
        by ``t`` (presumably in-silico bisulfite conversion -- TODO confirm).
    bedFrag : bool
        When true, the fragments are also saved as tab-delimited rows
        ``chrom, x+1, y+3`` to ``seqfile`` with ``'.fa'`` replaced by
        ``'_frag.bed'``.

    Returns
    -------
    None.  The number of sites and the number of retained fragments are
    printed to stdout.
    """
    # `offset` is the genome index of seq[0] once the 3-base overhang of the
    # previous line has been prepended, hence the initial -3.
    offset=-3
    hang='NNN'
    match=[]
    #find CCGG positions using Fasta file
    with open(fafile) as fa:
        for line in fa:
            l=line.strip('\n')
            if not l:
                # blank line: nothing to scan (l[0] would raise IndexError)
                continue
            if l[0]=='>':
                ch=l[1:]
                continue
            if l=='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN':
                # fast path for an all-N line: it cannot contain a site
                offset+=len(l)
                hang=l[-3:]
                continue
            seq=hang+l
            # There are len(seq)-3 overlapping 4-mers.  The last one must be
            # tested here: the next line only carries over 3 bases, so a site
            # ending exactly at the end of this line would otherwise be lost.
            for i in range(len(seq)-3):
                if seq[i:i+4]=='ccgg':
                    match.append(offset+i)
            hang=seq[-3:]
            offset+=len(l)
    print(len(match))

    #find cut sites with 40 < spacing < 250 and save as tuple
    # NOTE(review): the previous comment here said 40-220bp; the coded upper
    # bound is 250 -- confirm which is intended.
    FRAG=[(x,y) for x,y in zip(match[:-1],match[1:]) if 40<y-x<250]
    print(len(FRAG))

    #nibDB the cut sites 40bp 5'-3' and
    #save each as a pair of Fasta items with keys chr:position(strand)
    seq_dict={}
    ids,loci=[],[]
    BF=[]
    for x,y in FRAG:
        if bedFrag:
            BF.append([ch,str(x+1),str(y+3)])
        #for x: window (x+1, x+41) on the '+' strand, keyed by its start
        start=x+1
        stop=x+41
        ids.append(ch+':'+str(start)+'+')
        loci.append((ch,start,stop,'+'))
        #for y: window (y-37, y+3) on the '-' strand, keyed by its stop
        start=y-37
        stop=y+3
        ids.append(ch+':'+str(stop)+'-')
        loci.append((ch,start,stop,'-'))
    if bedFrag:
        np.savetxt(seqfile.replace('.fa','_frag.bed'),BF,fmt='%s',delimiter='\t')

    if genome=='hg18':
        DB=NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        DB=NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])
    fa_ids,seqs=DB.get_fasta_batch(loci)

    # Both strands are stored exactly as returned by NibDB; no reversal is
    # applied to '-' entries here.
    for key,seq in zip(ids,seqs):
        seq_dict[key]=seq.replace('c','t') if convert else seq
    Fasta.write(seq_dict,seqfile)
# Optional whitelist of gene identifiers: one identifier per line of the file
# named by opts.gene_list, surrounding whitespace stripped.
if opts.gene_list :
    # context manager so the handle is closed as soon as the list is read
    with open(opts.gene_list) as gene_list_f :
        gene_list = [x.strip() for x in gene_list_f]

# Set copy used only for the membership test in the loop below; gene_list
# itself is left untouched.
gene_set = set(gene_list) if gene_list else None

# Field of each record that is compared against the whitelist: 'bin' unless the
# gene type is 'refgene' and that is not the first entry of gene_type_choices.
id_index = 'bin'
if opts.gene_type != gene_type_choices[0] and opts.gene_type == 'refgene' :
    id_index = 'name'

# Build one (chrom, start, end, strand) request per selected record, and
# remember which 'bin/name' labels share each (chrom, start, end) window.
seq_recs = []
gene_map = defaultdict(list)
for rec in refgene_f :
    if gene_set and rec[id_index] not in gene_set :
        continue # skip this one
    # Window runs from txStart-upstream to txStart+downstream, clamped to
    # [0, nbases of the chromosome].
    # NOTE(review): txStart is used as the anchor for both strands -- confirm
    # this is intended for '-' strand records.
    tx_start = int(rec['txStart'])
    st = max(0,tx_start-opts.upstream)
    end = min(tx_start+opts.downstream,nib_db.db_info[rec['chrom']]['nbases'])
    key = (rec['chrom'],st,end,rec['strand'])
    seq_recs.append(key)
    gene_map[key[:-1]].append(rec['bin']+'/'+rec['name'])

fasta_recs = nib_db.get_fasta_batch(seq_recs)

# Write to opts.output when given, otherwise to stdout.
out_f = open(opts.output,'w') if opts.output else sys.stdout
header_regex = re.compile(r'^.*(chr[0-9MXY]+).*:([0-9]+)-([0-9]+).*$')
for header, seq in zip(*fasta_recs) :
    # map sequences back to gene names using the header; headers that do not
    # parse, or whose window is not in gene_map, are written unchanged
    reg_obj = header_regex.search(header)
    if reg_obj is not None :
        chrm,st,end = reg_obj.groups()
        gene_names = gene_map.get((chrm,int(st),int(end)))
        if gene_names is not None :
            header = header.strip()+':'+','.join(gene_names)+'\n'
    out_f.write(header+seq+'\n')
# Draw a random subset of the indices in [st_i, end_i); its size is
# sample_percent of the range length, truncated to an integer.
num_to_sample = int(sample_percent*(end_i-st_i))
inds_to_sample = random.sample(xrange(st_i,end_i),num_to_sample)

# Sequences already fetched are cached in pval_bin_memo, so only the sampled
# indices that are not cached yet need a lookup.
unmemoed_inds_to_sample = set(inds_to_sample).difference(pval_bin_memo.keys())
bin_fasta_batch = []
for peak_i in unmemoed_inds_to_sample :
    peak = all_peaks[peak_i]
    bin_fasta_batch.append(
        (str(peak['chr']), int(peak['start']), int(peak['end']), '+')
    )

if bin_fasta_batch :
    bin_headers, bin_seq = nibDb.get_fasta_batch(bin_fasta_batch)
    # The set is iterated in the same order as when the batch was built, so
    # position i of the result belongs to the i-th index of the set.
    for i, ind in enumerate(unmemoed_inds_to_sample) :
        pval_bin_memo[ind] = bin_seq[i].upper()

# Score every sampled sequence and rescale the best score by the
# [m.minscore, m.maxscore] range; the scores of this batch are collected in a
# fresh list that is then replaced by an array.
pval_bin_pvals.append([])
bin_scores = pval_bin_pvals[-1]
for ind in inds_to_sample :
    raw_score = m.bestscan(pval_bin_memo[ind])
    max_score = (raw_score-m.minscore)/(m.maxscore-m.minscore)
    bin_scores.append(max_score)
pval_bin_pvals[-1] = np.array(pval_bin_pvals[-1])

mp.figure(figsize=(4,4))
font = {'size':'9'}