def plot_GC(chr, tbx_gc, cp_vect, starts, ends):
    """Dump windowed copy-number variance vs. local GC content to GC.txt.

    For every 101st window (with a 50-window flank on each side), pairs the
    windowed variance at the center with the mean GC fraction fetched from a
    tabix-indexed GC track over the flanked coordinate span.

    Parameters
    ----------
    chr : contig name passed to tbx_gc.fetch
    tbx_gc : tabix-indexed GC track (pysam); column 3 holds the GC fraction
    cp_vect : per-window copy-number values (array-like)
    starts, ends : per-window coordinate arrays, parallel to cp_vect

    NOTE(review): this is a debugging/diagnostic routine -- it terminates
    the process at the end, as the original did with exit(1).
    """
    cp_vect = cp_vect.astype(np.float64)
    var = get_windowed_variance(cp_vect, 50)
    # Context manager guarantees GC.txt is flushed and closed (the original
    # left the handle open and relied on process exit).
    with open("GC.txt", 'w') as F:
        F.write("var\tgc\n")
        # Step 101 => non-overlapping blocks of 101 windows. Stop 50 short of
        # the end so ends[i + 50] stays in range (the original iterated to
        # starts.shape[0] and could raise IndexError on the final block).
        for i in range(50, starts.shape[0] - 50, 101):
            s = starts[i - 50]
            e = ends[i + 50]
            gc_vals = [float(l[3])
                       for l in tbx_gc.fetch(chr, s, e, parser=pysam.asTuple())]
            gc = np.mean(np.array(gc_vals))
            # Skip zero-variance centers (uninformative for the var~GC plot).
            if var[i] != 0:
                F.write("%f\t%f\n" % (var[i], gc))
    exit(1)
def init_all_dists(self):
    """Precompute the cumulative sum, windowed variances and null-model
    statistics over self.all_cps, then mark this object initialized."""
    self.get_indiv_window_dist()

    cps = self.all_cps
    self.csum_all = np.cumsum(cps)
    self.mu_probDensByWidth = {}
    self.ll_probDensByWidth = {}

    hw = 250
    self.half_width = hw
    self.variance_vect = get_windowed_variance(cps, hw)
    # At position k, var_left / var_right hold the variance of the
    # 2*half_width+1 windows to the left / right of k (shifted copies of
    # the centered windowed-variance vector).
    self.var_left = np.roll(self.variance_vect, hw + 1)
    self.var_right = np.roll(self.variance_vect, -hw - 1)
    self.get_variance_dist()

    # Null model: global variance / mean across all copy-number values.
    self.null_var = np.var(cps)
    self.null_mu = np.mean(cps)
    self.initialized = True
def __init__(self, chr, cp_data, starts, ends, cutoff_scale, **kwargs):
    """Multi-scale edge detection over per-window copy-number data.

    Convolves cp_data with Gaussian first/second derivatives over a range
    of scales, tracks second-derivative zero-crossing contours through
    scale space, keeps edges that persist to at least cutoff_scale, and
    hierarchically merges the resulting segments.

    Parameters
    ----------
    chr : contig name
    cp_data : per-window copy-number vector
    starts, ends : per-window coordinate arrays, parallel to cp_data
    cutoff_scale : minimum scale an edge must survive to be retained

    Keyword arguments
    -----------------
    max_merge (0.5), use_means (False), n_scales (51), scale_width (1),
    smoothing_kernel (np.array([1, 2, 1])), and '-n_bin_smoothings' (0).
    NOTE(review): the '-n_bin_smoothings' key has a stray leading '-';
    kept as-is for backward compatibility, so a caller passing
    'n_bin_smoothings' is silently ignored -- confirm intent.
    """
    max_merge = kwargs.get("max_merge", 0.5)
    use_means = kwargs.get("use_means", False)
    n_scales = kwargs.get("n_scales", 51)
    scale_width = kwargs.get('scale_width', 1)
    n_bin_smoothings = kwargs.get('-n_bin_smoothings', 0)
    smoothing_kernel = kwargs.get('smoothing_kernel', np.array([1, 2, 1]))

    self.chr = chr
    self.cutoff_scale = cutoff_scale
    self.scales = list(np.arange(1, n_scales, scale_width))
    self.starts = starts
    self.ends = ends
    self.n_wnds = self.starts.shape[0]
    self.cp_data = cp_data
    # Per-scale first derivative (float) and signed edge indicator (int8).
    self.der1 = np.zeros((len(self.scales), self.n_wnds), dtype=np.float32)
    self.der2 = np.zeros((len(self.scales), self.n_wnds), dtype=np.int8)
    # Windowed variance with a 500-window half-width; l_vars / r_vars are
    # shifted copies so position k sees the variance of its flanking regions.
    self.vars = get_windowed_variance(cp_data.astype(np.float64), 500)
    self.l_vars = np.roll(self.vars, 501)
    self.r_vars = np.roll(self.vars, -501)

    print("scales range from %f-%f" % (self.scales[0], self.scales[-1]),
          file=stderr)

    # Optional binomial pre-smoothing of the raw signal.
    for i in range(n_bin_smoothings):
        print("doing binomial smooth #%d" % i, file=stderr)
        cp_data = ndi.convolve1d(cp_data, smoothing_kernel) / \
            np.sum(smoothing_kernel)

    transitions_by_scale = {}
    print("finding contours...", file=stderr)
    for i_scale, scale in enumerate(self.scales):
        stderr.write("%.2f " % (scale))
        stderr.flush()
        g1 = ndi.gaussian_filter1d(cp_data, scale, order=1)
        g2 = ndi.gaussian_filter1d(cp_data, scale, order=2)
        edges, pos_edges, neg_edges = self.get_n_edges(g1, g2)
        self.der1[i_scale, :] = g1
        self.der2[i_scale, :] = pos_edges - neg_edges
        transitions_by_scale[scale] = (edges, pos_edges, neg_edges)
    stderr.write("done\n")

    self.contour_intersects, x_intercept_to_scale = get_contours(self.der2)

    # Collect every edge seen at any scale; keep those that persist to at
    # least cutoff_scale.
    edges_passing_cutoff = []
    curr_all_edges = []
    curr_all_edges_scales = []
    for scale, edges in self.contour_intersects.items():
        curr_all_edges.extend(edges)
        curr_all_edges_scales.extend([scale] * len(edges))
        if scale >= cutoff_scale:
            edges_passing_cutoff.extend(edges)

    edges_passing_cutoff = sorted(set(edges_passing_cutoff))
    all_edges_scales = sorted(zip(curr_all_edges, curr_all_edges_scales))

    stderr.write("hierarchically merging segments\n")
    t = time.time()
    segments_s, segments_e, cps = c_hierarch_merge_edges(
        cp_data, edges_passing_cutoff, max_merge, use_means,
        self.n_wnds, self.starts, self.ends)
    self.segment_edges = (segments_s, segments_e, cps)
    print("hierarchical clustering completed in %fs" % (time.time() - t),
          file=stderr)
def __init__(self, chr, cp_data, starts, ends, cutoff_scale, **kwargs):
    """Multi-scale edge detection over per-window copy-number data.

    Convolves cp_data with Gaussian first/second derivatives over a range
    of scales, tracks second-derivative zero-crossing contours through
    scale space, keeps edges that persist to at least cutoff_scale, and
    hierarchically merges the resulting segments.

    Parameters
    ----------
    chr : contig name
    cp_data : per-window copy-number vector
    starts, ends : per-window coordinate arrays, parallel to cp_data
    cutoff_scale : minimum scale an edge must survive to be retained

    Keyword arguments: max_merge (0.5), use_means (False), n_scales (51),
    scale_width (1), smoothing_kernel (np.array([1, 2, 1])), and
    '-n_bin_smoothings' (0).
    NOTE(review): the '-n_bin_smoothings' key has a stray leading '-', so a
    caller passing 'n_bin_smoothings' is silently ignored -- confirm intent.
    """
    max_merge = kwargs.get("max_merge", 0.5)
    use_means = kwargs.get("use_means", False)
    n_scales = kwargs.get("n_scales", 51)
    #n_scales=kwargs.get("n_scales",30)
    scale_width = kwargs.get('scale_width', 1)
    n_bin_smoothings = kwargs.get('-n_bin_smoothings', 0)
    smoothing_kernel = kwargs.get('smoothing_kernel', np.array([1, 2, 1]))

    self.chr = chr
    self.cutoff_scale = cutoff_scale
    self.scales = list(np.arange(1, n_scales, scale_width))
    self.starts = starts
    self.ends = ends
    self.n_wnds = self.starts.shape[0]
    self.cp_data = cp_data
    # Per-scale first derivative (float) and signed edge indicator (int8).
    self.der1 = np.zeros((len(self.scales), self.n_wnds), dtype=np.float32)
    self.der2 = np.zeros((len(self.scales), self.n_wnds), dtype=np.int8)
    # Windowed variance with a 500-window half-width; l_vars / r_vars are
    # shifted copies so position k sees the variance of its flanking regions.
    self.vars = get_windowed_variance(cp_data.astype(np.float64), 500)
    self.l_vars = np.roll(self.vars, 501)
    self.r_vars = np.roll(self.vars, -501)

    print("scales range from %f-%f" % (self.scales[0], self.scales[-1]),
          file=stderr)

    # Optional binomial pre-smoothing of the raw signal (rebinds the local
    # cp_data; self.cp_data keeps the unsmoothed original).
    for i in range(n_bin_smoothings):
        print("doing binomial smooth #%d" % i, file=stderr)
        cp_data = ndi.convolve1d(
            cp_data, smoothing_kernel) / np.sum(smoothing_kernel)

    transitions_by_scale = {}
    print("finding contours...", file=stderr)
    for i_scale, scale in enumerate(self.scales):
        stderr.write("%.2f " % (scale))
        stderr.flush()
        g1 = ndi.gaussian_filter1d(cp_data, scale, order=1)
        g2 = ndi.gaussian_filter1d(cp_data, scale, order=2)
        edges, pos_edges, neg_edges = self.get_n_edges(g1, g2)
        self.der1[i_scale, :] = g1
        self.der2[i_scale, :] = pos_edges - neg_edges
        transitions_by_scale[scale] = (edges, pos_edges, neg_edges)
    stderr.write("done\n")

    self.contour_intersects, x_intercept_to_scale = get_contours(self.der2)
    ######NOW we have all the per-scale contours
    #print contour_intersects

    edges_passing_cutoff = []
    curr_all_edges = []
    curr_all_edges_scales = []
    #take all the edges discovered at some scale
    for scale, edges in self.contour_intersects.items():
        curr_all_edges.extend(edges)
        curr_all_edges_scales.extend([scale for i in
                                      range(len(edges))])
        # Only edges that persist to at least cutoff_scale are kept.
        if scale >= cutoff_scale:
            edges_passing_cutoff.extend(edges)

    edges_passing_cutoff = sorted(set(edges_passing_cutoff))
    all_edges_scales = sorted(zip(curr_all_edges, curr_all_edges_scales))

    stderr.write("hierarchically merging segments\n")
    t = time.time()
    segments_s, segments_e, cps = c_hierarch_merge_edges(
        cp_data, edges_passing_cutoff, max_merge, use_means,
        self.n_wnds, self.starts, self.ends)
    #segments_s, segments_e, cps = hierarch_merge_edges(cp_data,
    #                                                   edges_passing_cutoff,
    #                                                   max_merge,use_means)
    self.segment_edges = (segments_s, segments_e, cps)
    print("hierarchical clustering completed in %fs" % (time.time() - t),
          file=stderr)