parser.add_option('-o', '--output', dest='output', default=None,
                  help='file to write fasta records to [default: stdout]')


if __name__ == '__main__':

    opts, args = parser.parse_args(sys.argv[1:])

    # The single positional argument is handed to get_org_settings() below.
    if len(args) != 1:
        parser.error('Exactly one argument is required')

    # The settings mapping provides both the refGene annotation path and the
    # directory of .nib files used to build the NibDB.
    org_settings = get_org_settings(args[0])

    refgene_fn = org_settings['refgene_anno_path']
    refgene_f = RefGeneFile(refgene_fn)

    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])

    # Optional filter file: one identifier per line, surrounding whitespace
    # stripped.  The file is read inside a `with` block so the handle is
    # closed as soon as the list has been built.
    gene_list = None
    if opts.gene_list:
        with open(opts.gene_list) as gene_list_f:
            gene_list = [x.strip() for x in gene_list_f]

    # Same membership answers as scanning gene_list, but without a linear
    # search for every annotation record.  Empty/absent list -> no filtering.
    wanted_ids = frozenset(gene_list) if gene_list else None

    # Record field that is compared against the gene list: 'bin' unless the
    # gene type is both not the first choice and equal to 'refgene'.
    id_index = 'bin'
    if opts.gene_type != gene_type_choices[0] and opts.gene_type == 'refgene':
        id_index = 'name'

    seq_recs = []
    gene_map = defaultdict(list)
    for rec in refgene_f:
        if wanted_ids and rec[id_index] not in wanted_ids:
            continue  # skip this one

        # Window around txStart, clamped to [0, chromosome length].
        # NOTE(review): the window is anchored on txStart regardless of
        # strand -- confirm that is intended for minus-strand genes.
        tx_start = int(rec['txStart'])
        chrom_len = nib_db.db_info[rec['chrom']]['nbases']
        st = max(0, tx_start - opts.upstream)
        end = min(tx_start + opts.downstream, chrom_len)
from chipsequtil import get_org_settings, BEDFile
from chipsequtil.nib import NibDB
from pprint import pprint

# Read the 'genome_dir' entry from the mm9 organism settings.
mm9_settings = get_org_settings('mm9')
genome_dir = mm9_settings['genome_dir']

# Build a NibDB over that single directory.
db = NibDB(nib_dirs=[genome_dir])

# get_fasta_from_bed() returns a (headers, sequences) pair for the BED file.
bed_fasta = db.get_fasta_from_bed('shuffled_peaks.bed')
fasta_headers, seqs = bed_fasta

# Pretty-print only the first ten sequences.
first_ten = seqs[:10]
pprint(first_ten)
def _find_ccgg_sites(fafile):
    """Return ``(chrom, sites)`` for every lowercase ``ccgg`` in *fafile*.

    *sites* holds the 0-based offset of the first base of each match, counted
    over all sequence lines in file order.  *chrom* is the text following
    ``>`` on the last header line seen, or ``None`` if there is no header.
    Offsets are not reset at a header line, so a single-record FASTA file is
    assumed.
    """
    chrom = None
    sites = []
    # The last three bases of the previous line are prepended to the current
    # one so that a match straddling a line break is still seen.
    hang = 'NNN'
    # Offset of seq[0]; starts at -3 to account for the 3-base placeholder.
    offset = -3
    with open(fafile) as fa:
        for line in fa:
            bases = line.rstrip('\r\n')
            if not bases:
                # A blank line carries no sequence (and has no bases[0]).
                continue
            if bases[0] == '>':
                chrom = bases[1:]
                continue
            if bases == 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN':
                # Fast path: a line consisting only of N cannot hold a match.
                offset += len(bases)
                hang = bases[-3:]
                continue
            seq = hang + bases
            # A string of length L has L - 3 windows of width 4.  Stopping
            # one short would drop the window ending on the line's last base,
            # which the next line's 3-base overhang does not cover either.
            for i in range(len(seq) - 3):
                if seq[i:i + 4] == 'ccgg':
                    sites.append(offset + i)
            hang = seq[-3:]
            offset += len(bases)
    return chrom, sites


def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    """Write the 40-base end sequences of CCGG-bounded fragments to *seqfile*.

    Steps:
      1. Locate every lowercase ``ccgg`` in *fafile* (printing the count).
      2. Keep pairs of adjacent sites whose spacing d satisfies
         40 < d < 250 (printing the count).
      3. For each kept pair (x, y) request two loci from a NibDB:
         ``(chrom, x+1, x+41, '+')`` and ``(chrom, y-37, y+3, '-')``.
      4. Optionally replace every lowercase ``c`` with ``t`` and write the
         sequences with ``Fasta.write``.

    Parameters:
      fafile  -- path of the FASTA file to scan (single record assumed).
      seqfile -- output path handed to ``Fasta.write``.
      genome  -- ``'hg18'`` selects the hard-coded human nib directory; any
                 other value uses the mm9 organism settings.
      convert -- when true, apply the c -> t replacement to each sequence.
      bedFrag -- when true, also save the kept fragments as tab-separated
                 ``chrom, x+1, y+3`` rows to *seqfile* with ``'.fa'``
                 replaced by ``'_frag.bed'``.

    Raises ValueError if fragments were found but the FASTA file had no
    header line to name the chromosome.
    """
    ch, match = _find_ccgg_sites(fafile)
    print(len(match))

    # Adjacent site pairs with spacing strictly between 40 and 250.
    FRAG = [(x, y) for x, y in zip(match[:-1], match[1:]) if 40 < y - x < 250]
    print(len(FRAG))

    if FRAG and ch is None:
        raise ValueError('no FASTA header line found in %s' % fafile)

    # Two entries per fragment, keyed chr:position(strand).
    ids, loci = [], []
    BF = []
    for x, y in FRAG:
        if bedFrag:
            BF.append([ch, str(x + 1), str(y + 3)])
        # end anchored at x, '+' strand
        ids.append(ch + ':' + str(x + 1) + '+')
        loci.append((ch, x + 1, x + 41, '+'))
        # end anchored at y, '-' strand; the key uses the stop coordinate
        ids.append(ch + ':' + str(y + 3) + '-')
        loci.append((ch, y - 37, y + 3, '-'))

    if bedFrag:
        np.savetxt(seqfile.replace('.fa', '_frag.bed'), BF, fmt='%s', delimiter='\t')

    # NOTE(review): every genome other than 'hg18' falls back to the mm9
    # settings, whatever value was passed -- confirm this is intended.
    if genome == 'hg18':
        DB = NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        DB = NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])

    # Assumes get_fasta_batch returns sequences in the order of `loci`, since
    # they are paired with `ids` positionally -- TODO confirm.
    _, seqs = DB.get_fasta_batch(loci)

    # Both strands are stored exactly as returned by NibDB (after the
    # optional c -> t replacement); '-' entries are not reversed here.
    seq_dict = {}
    for key, seq in zip(ids, seqs):
        seq_dict[key] = seq.replace('c', 't') if convert else seq

    Fasta.write(seq_dict, seqfile)
# NOTE(review): the line below is a script fragment whose line breaks were
# lost; it is kept verbatim because it ends inside an unterminated
# triple-quoted string (closing delimiter not visible here).
#
# What the visible statements do:
#   - pval_num_bins is 20 and pval_bin_size is the number of 'pvalue'
#     entries in all_peaks divided by pval_num_bins.
#   - sample_percent = max(min(1., 100./pval_bin_size), 0.1), i.e. the ratio
#     100/bin_size limited to the range [0.1, 1.0].
#     NOTE(review): the inline comment says "at most 10%", but max(..., 0.1)
#     makes 0.1 a lower bound -- confirm which is intended.
#     NOTE(review): 100./pval_bin_size raises ZeroDivisionError when
#     pval_bin_size is 0 (empty all_peaks, or fewer than 20 rows if `/` is
#     integer division here) -- confirm the input is guarded upstream.
#   - When opts.top_n is given, peaks is the first top_n rows of all_peaks
#     and peak_pvals is re-indexed by peak_pval_inds and cut to top_n;
#     otherwise peaks is all_peaks and peak_pvals is left unchanged.
#   - nibDb is a NibDB built from get_org_settings(org)['genome_dir'], passed
#     directly as nib_dirs (not wrapped in a list).
#   - Everything after the opening triple quote is string content up to the
#     end of this view; it looks like disabled code that fetched the peak
#     sequences via nibDb.get_fasta_batch and began building fg_fasta_dict.
# for pvalue vs motif score pval_num_bins = 20 pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins # try to take at least 100 sequences, at most 10% of bin size sample_percent = max(min(1.,100./pval_bin_size),0.1) pval_bin_memo = {} if opts.top_n is not None : peaks = all_peaks[0:opts.top_n] peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n] else : peaks = all_peaks # extract fasta sequences for these peaks nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir']) """ # get the peak sequences sys.stderr.write('Getting peak sequences\n') fasta_batch = [] for i in range(peaks.size) : fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+')) fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch) # need a dict for background sampling # headers have genome_dir and .nib in them, strip that out sys.stderr.write('Converting nib output to dict\n') fg_fasta_headers = list(fg_fasta_headers) fg_fasta_dict = {} for h,s in zip(fg_fasta_headers,fg_fasta) :