if __name__ == '__main__' : # parse command line arguments opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 3 : parser.error('Must provide two non-option arguments') # filenames and paths organism, experiment_fn, control_fn = args[0:3] control_fn = None if len(args) > 3 : control_fn = args[2] org_settings = get_org_settings(organism) refseq_fn = org_settings['annotation_path'] exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) if control_fn : cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) # the pipeline pipeline = Pypeline() steps = [] # split up files
mp.subplots_adjust(**subplots_sizes) mp.hist(peak_stats['fdr'],label=hist_labels[0],bins=20,log=True) mp.title('%s\npeak fdr distribution'%macs_f.file_info['name']) mp.xlabel('fdr') mp.ylabel('# peaks') mp.legend() mp.savefig(fdr_hist_fn) mp.clf() chr_dist_name = macs_f.file_info['name']+'_chr_dist.png' chr_dist_fn = os.path.join(infosite_img_path,chr_dist_name) chr_dist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+chr_dist_name peak_json[peak_fn]['chr distribution url'] = chr_dist_url chromos = [] if json_d.has_key('org') : chr_sizes_fn = get_org_settings(json_d['org'])['ucsc_chrom_sizes'] chromos = [r[0] for r in reader(open(chr_sizes_fn),delimiter='\t')] else : chromos = list(set(pos_chr_dist.keys()).union(neg_chr_dist.keys())) standard_chromos = filter(lambda x: re.search('^chr[0-9MXY]+$',x) is not None,chromos) # hack chrM, chrX and chrY so they sort right if 'chrM' in standard_chromos : standard_chromos[standard_chromos.index('chrM')] = 'chr100' if 'chrX' in standard_chromos : standard_chromos[standard_chromos.index('chrX')] = 'chr101' if 'chrY' in standard_chromos : standard_chromos[standard_chromos.index('chrY')] = 'chr102' standard_chromos.sort(key=lambda x: int(x.replace('chr','')))
def rejection_sample_bg(fg_dict, organism, bins=100, num_samples=None, verbose=False, bg_match_epsilon=1e-3): '''Generate background sequences according to the size, distance from genes, and GC content distributions of the supplied foreground sequences. *fg_dict* is a dictionary of <header>:<sequence> items, where the first part of the header must contain: >chrX:<start>-<end> *organism* is a string that will be used to call the *chipsequtil.get_org settings* function and uses the 'genome_dir' and 'annotation_path' keys. *bins* is the number of bins to use for representing the GC content distribution. Function returns a dictionary of <header>:<sequence> items of generated background sequences.''' nib_db = NibDB(nib_dirs=[get_org_settings(organism)['genome_dir']]) tss_fn = get_org_settings(organism)['annotation_path'] tss = defaultdict(list) for rec in RefGeneFile(tss_fn): tss[rec['chrom']].append(( int(rec['txStart']), int(rec['txEnd']), )) # for each peak find the chromosome, distance to nearest # gene, size of peaks in bases, and GC content num_samples = len(fg_dict) if not num_samples else num_samples dists, sizes = [], [] for header, seq in fg_dict.items(): # chromosome first field in fasta headers from bed2seq.bedtoseq chrom = header.split(':')[0] # adjust chromosomes in special cases if re.search('random', chrom.lower()) or chrom.lower() == 'chrm': continue # start first int in second field of bed2seq.bedtoseq header start = int(header.split(':')[1].split('-')[0]) midpoint = start + len(seq) / 2 # figure out which chromosome we're working on tss_chr = tss[chrom] # dsts_to_genes is the distance of this peak from all the genes, find minimum dists_to_genes = [(s[0] - midpoint) for s in tss_chr] try: min_dist = min(dists_to_genes, key=lambda x: abs(x)) dists.append(min_dist) except: err_str = 'Warning: no genes were found for sequence with header' \ ' %s, not using to calculate distributions.\n'%header sys.stderr.write(err_str) # calculate # bases 
sizes.append(len(seq)) # GC content distribution for the foreground sequences gc_dist = get_gc_content_distribution(fg_dict.values(), bins=bins) # max_gc is # peaks w/ highest GC content max_gc = max(gc_dist) # gene_starts is a list of all genes in (chromosome,gene start) tuples gene_starts = [] for key in tss.keys(): chrom = key.split('chr')[-1] for x in tss[key]: gene_starts.append((key, x[0])) # encapsulated function for proposing sequences def propose_sequence(dists, gene_starts, sizes, nib_db): # sample a random distance from the list of distances d = random.choice(dists) # pick a random gene chrom, coord = random.choice(gene_starts) # propose a starting point for the bg sequence midpoint = coord - d + random.randint(-100, 100) # propose a size for the bg sequence size = random.choice(sizes) start = int(midpoint - int(size / 2)) stop = int(midpoint + int(size / 2)) #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d)) # if start or stop are negative, skip and try again if start < 0 or stop < 0: seq = None # randomly choose strand strand = '+' if random.random() > 0.5 else '-' # extract the proposed sequence try: nib_title, seq = nib_db.get_fasta(chrom, start, stop, strand) except IOError, e: if verbose: sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n' % (chrom, start, stop, strand)) seq = None except NibException, e: if verbose: sys.stderr.write('NibDB.get_fasta error, %s\n' % e) seq = None
';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()]) fasta.append((header,seq)) return fasta if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 2 : parser.error('Must provide at least two non-option arguments') # instantiate the NibDB from the provided directory organism = args[0] nib_dir = get_org_settings(organism)['genome_dir'] nib_db = NibDB(nib_dirs=[nib_dir]) # determine specified format peak_fmt = opts.peak_format peak_fns = args[1:] # determine if there is an output file if opts.output : out_f = open(opts.output,'w') else : out_f = sys.stdout fasta_recs = [] for peak_fn in peak_fns :
help='file containing a list of gene identifiers to extract, one per line [default: %default]') gene_type_choices = ['symbol','refgene'] parser.add_option('-t','--gene-type',dest='gene_type',type='choice', choices=gene_type_choices,default=gene_type_choices[0], help='type of gene identifier in gene list, choose from %s [default: %%default]'%gene_type_choices) parser.add_option('-o','--output',dest='output',default=None, help='file to write fasta records to [default: stdout]') if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) if len(args) != 1 : parser.error('Exactly one argument is required') org_settings = get_org_settings(args[0]) refgene_fn = org_settings['refgene_anno_path'] refgene_f = RefGeneFile(refgene_fn) nib_db = NibDB(nib_dirs=[org_settings['genome_dir']]) gene_list = None if opts.gene_list : gene_list = [x.strip() for x in open(opts.gene_list).readlines()] id_index = 'bin' if opts.gene_type != gene_type_choices[0] : if opts.gene_type == 'refgene' : id_index = 'name'
def rejection_sample_bg(fg_dict,organism,bins=100,num_samples=None,verbose=False, bg_match_epsilon=1e-3) : '''Generate background sequences according to the size, distance from genes, and GC content distributions of the supplied foreground sequences. *fg_dict* is a dictionary of <header>:<sequence> items, where the first part of the header must contain: >chrX:<start>-<end> *organism* is a string that will be used to call the *chipsequtil.get_org settings* function and uses the 'genome_dir' and 'annotation_path' keys. *bins* is the number of bins to use for representing the GC content distribution. Function returns a dictionary of <header>:<sequence> items of generated background sequences.''' nib_db = NibDB(nib_dirs=[get_org_settings(organism)['genome_dir']]) tss_fn = get_org_settings(organism)['annotation_path'] tss = defaultdict(list) for rec in RefGeneFile(tss_fn) : tss[rec['chrom']].append((int(rec['txStart']),int(rec['txEnd']),)) # for each peak find the chromosome, distance to nearest # gene, size of peaks in bases, and GC content num_samples = len(fg_dict) if not num_samples else num_samples dists,sizes=[],[] for header,seq in fg_dict.items() : # chromosome first field in fasta headers from bed2seq.bedtoseq chrom = header.split(':')[0] # adjust chromosomes in special cases if re.search('random',chrom.lower()) or chrom.lower() == 'chrm' : continue # start first int in second field of bed2seq.bedtoseq header start = int(header.split(':')[1].split('-')[0]) midpoint = start + len(seq)/2 # figure out which chromosome we're working on tss_chr = tss[chrom] # dsts_to_genes is the distance of this peak from all the genes, find minimum dists_to_genes = [(s[0]-midpoint) for s in tss_chr] try : min_dist = min(dists_to_genes,key=lambda x : abs(x)) dists.append(min_dist) except : err_str = 'Warning: no genes were found for sequence with header' \ ' %s, not using to calculate distributions.\n'%header sys.stderr.write(err_str) # calculate # bases sizes.append(len(seq)) # GC 
content distribution for the foreground sequences gc_dist = get_gc_content_distribution(fg_dict.values(),bins=bins) # max_gc is # peaks w/ highest GC content max_gc = max(gc_dist) # gene_starts is a list of all genes in (chromosome,gene start) tuples gene_starts=[] for key in tss.keys(): chrom=key.split('chr')[-1] for x in tss[key]: gene_starts.append((key,x[0])) # encapsulated function for proposing sequences def propose_sequence(dists, gene_starts, sizes, nib_db) : # sample a random distance from the list of distances d = random.choice(dists) # pick a random gene chrom, coord = random.choice(gene_starts) # propose a starting point for the bg sequence midpoint = coord-d+random.randint(-100,100) # propose a size for the bg sequence size = random.choice(sizes) start = int(midpoint-int(size/2)) stop = int(midpoint+int(size/2)) #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d)) # if start or stop are negative, skip and try again if start < 0 or stop < 0 : seq = None # randomly choose strand strand = '+' if random.random() > 0.5 else '-' # extract the proposed sequence try : nib_title, seq = nib_db.get_fasta(chrom,start,stop,strand) except IOError, e : if verbose : sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n'%(chrom,start,stop,strand)) seq = None except NibException, e : if verbose : sys.stderr.write('NibDB.get_fasta error, %s\n'%e) seq = None
def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    '''Write the end sequences of CCGG-bounded fragments found in *fafile*.

    *fafile* is a FASTA file that is scanned line by line for lowercase
    'ccgg'.  Positions are accumulated over every sequence line and only the
    last header seen names the chromosome, so the file is expected to hold
    a single sequence.
    NOTE(review): uppercase 'CCGG' is not matched - confirm the input FASTA
    is all lowercase.

    Consecutive sites whose spacing d satisfies 40 < d < 250 form a fragment
    (x,y).  For each fragment two loci are fetched from the NibDB:
    (chrom, x+1, x+41, '+') stored under the key 'chrom:<x+1>+' and
    (chrom, y-37, y+3, '-') stored under the key 'chrom:<y+3>-'.

    *seqfile* is the path handed to Fasta.write for the resulting records.

    *genome* selects the nib directory: 'hg18' uses a hard coded path, any
    other value uses the 'mm9' organism settings.
    NOTE(review): values other than 'hg18'/'mm9' silently fall back to mm9.

    *convert*, when true, replaces every 'c' with 't' in the fetched
    sequences (presumably to mimic bisulfite conversion - confirm).

    *bedFrag*, when true, also writes the fragments as a tab delimited file
    named *seqfile* with '.fa' replaced by '_frag.bed'.

    Function returns None; the counts of sites and fragments are printed.'''
    # the last three bases of the previous line are carried over in *hang*
    # so sites spanning a line break are found; *start* is offset by the
    # same three bases so that start+i is the position of the site
    start=-3
    hang='NNN'
    match=[]
    #find CCGG positions using Fasta file
    with open(fafile) as fa:
        for line in fa:
            l=line.strip('\n')
            # a blank line holds no bases and would break the l[0] test below
            if not l:
                continue
            if l[0]=='>':
                ch=l[1:]
                continue
            # fast path for lines of masked sequence, which hold no site
            if l=='NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN':
                start+=len(l)
                hang=l[-3:]
                continue
            seq=hang+l
            # len(seq)-3 windows of four bases; stopping one short of this
            # misses a site that ends on the last base of the line, and the
            # next line cannot see it because only three bases carry over
            for i in range(len(seq)-3):
                if seq.startswith('ccgg',i):
                    match.append(start+i)
            hang=seq[-3:]
            start+=len(l)
    print(len(match))

    #find cut sites more than 40bp and less than 250bp apart, save as tuples
    FRAG=[(x,y) for x,y in zip(match[:-1],match[1:]) if 40 < y-x < 250]
    print(len(FRAG))

    #nibDB the cut sites 40bp 5'-3' and
    #save each as a pair of Fasta items with keys chr:position(strand)
    seq_dict={}
    ids,loci=[],[]
    BF=[]
    for x,y in FRAG:
        if bedFrag:
            BF.append([ch,str(x+1),str(y+3)])
        #for x
        left_start,left_stop=x+1,x+41
        ids.append(ch+':'+str(left_start)+'+')
        loci.append((ch,left_start,left_stop,'+'))
        #for y
        right_start,right_stop=y-37,y+3
        ids.append(ch+':'+str(right_stop)+'-')
        loci.append((ch,right_start,right_stop,'-'))
    if bedFrag:
        np.savetxt(seqfile.replace('.fa','_frag.bed'),BF,fmt='%s',delimiter='\t')

    if genome=='hg18':
        DB=NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        DB=NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])
    # the headers returned by the NibDB are not used, keys come from *ids*
    _,seqs=DB.get_fasta_batch(loci)
    for seq_id,seq in zip(ids,seqs):
        # '+' and '-' records are both stored exactly as the NibDB returns
        # them; no extra reversal is applied to the '-' strand sequences
        seq_dict[seq_id]=seq.replace('c','t') if convert else seq
    Fasta.write(seq_dict,seqfile)
# Script: fetch the sequences for the intervals listed in
# 'shuffled_peaks.bed' from the mm9 nib files and show the first ten.
from chipsequtil import get_org_settings, BEDFile
from chipsequtil.nib import NibDB
from pprint import pprint

# the nib directory comes from the 'genome_dir' entry of the mm9 settings
genome_dir = get_org_settings('mm9')['genome_dir']
db = NibDB(
    nib_dirs=[genome_dir],
)

# the call result unpacks into a pair: headers first, sequences second
fasta_headers, seqs = db.get_fasta_from_bed(
    'shuffled_peaks.bed',
)

# only the sequences are displayed, the headers are left unused
pprint(seqs[:10])
num_peak_bases = 0 for header, seq in fg.items() : num_peak_bases += len(seq) if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 3 : parser.error('Must provide three non-option arguments') sample_type, organism, fg_fn = args[:3] settings_dict = get_org_settings(organism) fg = Fasta.load(fg_fn) bg = rejection_sampling(fg,settings_dict) ############################################################### # start Chris' code from rej_samp_bg_rand2.py the_genes={} #list of distances to nearest TSS # for each peak find the chromosome, distance to nearest # gene, size of peaks in bases, and GC content the_chrs,dists,sizes,gcs=[],[],[],[] # number of bases in the fg sequences size=0
# for pvalue vs motif score pval_num_bins = 20 pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins # try to take at least 100 sequences, at most 10% of bin size sample_percent = max(min(1.,100./pval_bin_size),0.1) pval_bin_memo = {} if opts.top_n is not None : peaks = all_peaks[0:opts.top_n] peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n] else : peaks = all_peaks # extract fasta sequences for these peaks nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir']) """ # get the peak sequences sys.stderr.write('Getting peak sequences\n') fasta_batch = [] for i in range(peaks.size) : fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+')) fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch) # need a dict for background sampling # headers have genome_dir and .nib in them, strip that out sys.stderr.write('Converting nib output to dict\n') fg_fasta_headers = list(fg_fasta_headers) fg_fasta_dict = {} for h,s in zip(fg_fasta_headers,fg_fasta) :