# requires (assumed to be imported at module level):
#   from numpy import zeros, tril_indices, triu_indices, unique, where
#   from numpy.ma import masked_array as ma
def _getSignificantData(self, sig_lvl):
    '''Find which edges are significant at the passed level; set self properties.'''
    rows, cols = self.data.shape  # rows == cols: square dissimilarity matrix
    mask = zeros((rows, cols))
    mask[tril_indices(rows, 0)] = 1  # mask the diagonal and lower triangle
    cvals = unique(self.data[triu_indices(rows, 1)])  # cvals is sorted
    # calculate the upper bound, i.e. the value in the distribution that has
    # sig_lvl fraction of the data at or above it. this is not guaranteed to
    # be precise because of repeated values. for instance, assume the
    # distribution of dissimilarity values is:
    # [.1, .2, .2, .2, .2, .3, .4, .5, .6, .6, .6, .6, .6, .6, .7]
    # and you want sig_lvl=.2, i.e. 20 percent of the linkages as
    # significant. this suggests choosing .6 since it is third in the
    # ordered list (of 15 elements, 3/15=.2). but since there is no a-priori
    # way to tell which of the multiple .6 linkages are significant, we
    # select all of them, forcing our cutoff to encompass 7/15ths of the
    # data. the round call on ub avoids documented numpy weirdness where it
    # misassigns >= comparisons for long floats.
    ub = round(cvals[-int(round(sig_lvl * len(cvals)))], 7)
    mdata = ma(self.data, mask)
    self.actual_sig_lvl = \
        (mdata >= ub).sum() / float(mdata.shape[0] * (mdata.shape[0] - 1) / 2)
    self.sig_edges = where(mdata >= ub, 1, 0).nonzero()
    self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
    self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
    self.sig_otus = list(set(self.otu1 + self.otu2))
    self.edges = zip(self.otu1, self.otu2)
    self.cvals = mdata[self.sig_edges[0], self.sig_edges[1]]
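# A minimal standalone sketch (not part of the class) of the tie handling
# described in the comment above, run on the example distribution. Note that
# the method thresholds over unique() values, so counts differ when scores
# repeat; the sorted full list below follows the comment's 3/15 arithmetic.
# All names (demo_vals, demo_sig_lvl) are illustrative.
from numpy import array, sort

demo_vals = array([.1, .2, .2, .2, .2, .3, .4, .5, .6, .6, .6, .6, .6, .6, .7])
demo_sig_lvl = 0.2
demo_sorted = sort(demo_vals)
demo_ub = demo_sorted[-int(round(demo_sig_lvl * len(demo_sorted)))]  # 0.6
print((demo_vals >= demo_ub).sum())  # 7 -- the ties at .6 push 3/15 to 7/15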
# requires (assumed to be imported at module level):
#   from numpy import (zeros, ones, tril, tril_indices, triu_indices, unique,
#                      where, hstack, floor, ceil, inf)
#   from numpy.ma import masked_array as ma
def _getSignificantData(self, sig_lvl, empirical):
    '''Find which edges are significant at the passed level; set self properties.'''
    rows, cols = self.data.shape  # rows == cols: square matrix
    if empirical:
        mask = zeros((rows, cols))
        mask[tril_indices(rows, 0)] = 1  # mask the diagonal and lower triangle
        cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted scores
        alpha = sig_lvl / 2.
        lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
        ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
        if sig_lvl == 0.:
            lb = -inf
            ub = inf
        mdata = ma(self.cdata, mask)
        if lb == ub:  # overcounting is going to happen
            print 'lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(), \
                (mdata <= lb).sum(), (mdata == lb).sum(), (mdata == ub).sum()
        # because of the floor and ceil calls we use <= for the lower bound
        # and >= for the upper bound. as an example, assume you have 100
        # pvals and choose sig_lvl=.05. then you would take 2.5 values in
        # each tail. since the 2.5th value doesn't exist (DNE), we floor to
        # index 2 for the lower bound and ceil to the 3rd-from-last value
        # for the upper bound.
        upper_sig_edges = where(mdata >= ub, 1, 0).nonzero()
        lower_sig_edges = where(mdata <= lb, 1, 0).nonzero()
        e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
        e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
        self.sig_edges = (e1, e2)
        self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
        self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
        self.sig_otus = list(set(self.otu1 + self.otu2))
        self.edges = zip(self.otu1, self.otu2)
        self.pvals = [self.data[i][j] for i, j in
                      zip(self.sig_edges[0], self.sig_edges[1])]
    else:
        # correlation metrics are symmetric, so consider only the upper
        # triangle: adding 10 to the lower triangle (and diagonal) makes
        # those entries larger than sig_lvl, so they are never chosen.
        # data is an nxn matrix; sig_edges is a tuple of arrays holding
        # row, col indices.
        self.sig_edges = \
            ((tril(10 * ones((rows, cols)), 0) + self.data) <= sig_lvl).nonzero()
        self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
        self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
        self.sig_otus = list(set(self.otu1 + self.otu2))
        self.edges = zip(self.otu1, self.otu2)
        self.pvals = [self.data[i][j] for i, j in
                      zip(self.sig_edges[0], self.sig_edges[1])]
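# A minimal standalone sketch of the two-sided empirical cutoffs computed in
# the empirical branch above, under the 100-value example from the comment.
# All names (demo_cvals, demo_alpha) are illustrative.
from numpy import arange, floor, ceil

demo_cvals = arange(100) / 100.   # stand-in for the sorted unique scores
demo_alpha = 0.05 / 2.            # sig_lvl = .05, split across both tails
demo_lb = demo_cvals[int(floor(demo_alpha * len(demo_cvals)))]   # 0.02
demo_ub = demo_cvals[-int(ceil(demo_alpha * len(demo_cvals)))]   # 0.97
# edges scoring <= demo_lb or >= demo_ub would be flagged as significant
print('lb=%s ub=%s' % (demo_lb, demo_ub))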
# requires (assumed to be imported at module level):
#   from numpy import (zeros, tril_indices, triu_indices, unique, where,
#                      hstack, floor, ceil, inf)
#   from numpy.ma import masked_array as ma
def _getSignificantData(self, sig_lvl):
    '''Find which edges are significant at the passed level; set self properties.'''
    rows, cols = self.data.shape  # rows == cols: square matrix
    mask = zeros((rows, cols))
    mask[tril_indices(rows, 0)] = 1  # mask the diagonal and lower triangle
    cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted scores
    alpha = sig_lvl / 2.
    lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
    ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
    if sig_lvl == 0.:
        lb = -inf
        ub = inf
    mdata = ma(self.cdata, mask)
    if lb == ub:  # overcounting is going to happen
        print 'lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(), \
            (mdata <= lb).sum(), (mdata == lb).sum(), (mdata == ub).sum()
    # because of the floor and ceil calls we use <= for the lower bound and
    # >= for the upper bound. as an example, assume you have 100 pvals and
    # choose sig_lvl=.05. then you would take 2.5 values in each tail. since
    # the 2.5th value doesn't exist (DNE), we floor to index 2 for the lower
    # bound and ceil to the 3rd-from-last value for the upper bound.
    upper_sig_edges = where(mdata >= ub, 1, 0).nonzero()
    lower_sig_edges = where(mdata <= lb, 1, 0).nonzero()
    e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
    e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
    self.sig_edges = (e1, e2)
    self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
    self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
    self.sig_otus = list(set(self.otu1 + self.otu2))
    self.edges = zip(self.otu1, self.otu2)
    self.pvals = [self.data[i][j] for i, j in
                  zip(self.sig_edges[0], self.sig_edges[1])]
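# A minimal standalone sketch of the lower-triangle masking used above, so
# that each symmetric pair is only counted once. All names are illustrative.
from numpy import arange, zeros, tril_indices
from numpy.ma import masked_array

demo_n = 4
demo_data = arange(16.).reshape(demo_n, demo_n)
demo_mask = zeros((demo_n, demo_n))
demo_mask[tril_indices(demo_n, 0)] = 1          # 1 = hide this cell
demo_mdata = masked_array(demo_data, demo_mask)
print((demo_mdata >= 6).sum())  # 3: only upper-triangle cells are compared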
# requires (assumed to be imported at module level):
#   from numpy import (zeros, tril_indices, triu_indices, triu, unique,
#                      where, hstack, floor, ceil, inf)
#   from numpy.ma import masked_array as ma
def _getSignificantData(self, sig_lvl, empirical, corr_filter):
    '''Find which edges are significant at the passed level; set self properties.'''
    rows, cols = self.pdata.shape  # rows == cols: square matrix
    if empirical and corr_filter is not None:
        raise ValueError("can't have both the empirical and the correlation "
                         "(Pearson) filter")
    if corr_filter is not None:
        # find edges which are significant enough based on sig_lvl
        se = self.pdata <= sig_lvl
        # find edges which are significant enough based on corr_filter
        pe = abs(self.cdata) >= corr_filter
        # keep only the strictly upper triangle of edges passing both tests
        self.sig_edges = triu(se * pe, 1).nonzero()
        self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
        self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
        self.sig_otus = list(set(self.otu1 + self.otu2))
        self.edges = zip(self.otu1, self.otu2)
        self.pvals = [self.pdata[i][j] for i, j in
                      zip(self.sig_edges[0], self.sig_edges[1])]
    elif empirical:
        mask = zeros((rows, cols))
        mask[tril_indices(rows, 0)] = 1  # mask the diagonal and lower triangle
        cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted scores
        alpha = sig_lvl / 2.
        lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
        ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
        if sig_lvl == 0.:
            lb = -inf
            ub = inf
        mdata = ma(self.cdata, mask)
        if lb == ub:  # overcounting is going to happen
            print 'lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(), \
                (mdata <= lb).sum(), (mdata == lb).sum(), (mdata == ub).sum()
        # because of the floor and ceil calls we use <= for the lower bound
        # and >= for the upper bound. as an example, assume you have 100
        # pvals and choose sig_lvl=.05. then you would take 2.5 values in
        # each tail. since the 2.5th value doesn't exist (DNE), we floor to
        # index 2 for the lower bound and ceil to the 3rd-from-last value
        # for the upper bound.
        upper_sig_edges = where(mdata >= ub, 1, 0).nonzero()
        lower_sig_edges = where(mdata <= lb, 1, 0).nonzero()
        e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
        e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
        self.sig_edges = (e1, e2)
        self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
        self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
        self.sig_otus = list(set(self.otu1 + self.otu2))
        self.edges = zip(self.otu1, self.otu2)
        self.pvals = [self.pdata[i][j] for i, j in
                      zip(self.sig_edges[0], self.sig_edges[1])]
    else:
        # correlation metrics are symmetric, so consider only the upper
        # triangle: zero out the diagonal and lower triangle so only
        # upper-triangle p-values can pass the cutoff.
        # pdata is an nxn matrix; sig_edges is a tuple of arrays holding
        # row, col indices.
        tmp = (self.pdata <= sig_lvl)
        tmp[tril_indices(self.pdata.shape[0], 0)] = 0
        self.sig_edges = tmp.nonzero()
        self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
        self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
        self.sig_otus = list(set(self.otu1 + self.otu2))
        self.edges = zip(self.otu1, self.otu2)
        self.pvals = [self.pdata[i][j] for i, j in
                      zip(self.sig_edges[0], self.sig_edges[1])]
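# A minimal standalone sketch of the corr_filter branch above: an edge must
# pass both the p-value cutoff and the correlation-magnitude filter, and
# only strictly upper-triangle entries survive. The arrays are illustrative.
from numpy import array, triu

demo_pdata = array([[1.00, 0.01, 0.20],
                    [0.01, 1.00, 0.03],
                    [0.20, 0.03, 1.00]])
demo_cdata = array([[1.0, 0.9, -0.2],
                    [0.9, 1.0, -0.8],
                    [-0.2, -0.8, 1.0]])
demo_se = demo_pdata <= 0.05          # significant p-values
demo_pe = abs(demo_cdata) >= 0.5      # strong correlations
print(triu(demo_se * demo_pe, 1).nonzero())  # (array([0, 1]), array([1, 2]))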
# requires (assumed to be imported at module level):
#   import numpy as np
#   import pandas as pd
#   import matplotlib as mpl
#   import matplotlib.pyplot as plt
#   from numpy.ma import masked_array as ma
# shiftedColorMap is an external helper that recenters a colormap; it is
# assumed to be defined elsewhere in this module.
def colorgrid_plot(means):
    # create grid spacing
    pad_col_grid_after = [1, 5]
    pad_row_grid_after = [2]
    pad_size = 0.2
    normal_size = 1

    def pad_spacing(no, pad_size, normal_size, gaps):
        # cumulative cell edges, with an extra gap inserted after each
        # index listed in `gaps`
        count = 0
        spacing = [0]
        for x in range(1, no + 1):
            count += normal_size
            spacing.append(count)
            if x in gaps:
                count += pad_size
                spacing.append(count)
        return spacing

    x_spacing = pad_spacing(len(means.columns), pad_size, normal_size,
                            pad_col_grid_after)
    y_spacing = pad_spacing(len(means.index), pad_size, normal_size,
                            pad_row_grid_after)
    x_spacing_mgrid, y_spacing_mgrid = np.meshgrid(x_spacing, y_spacing)

    # insert dummy (all-NaN) columns/rows to create the visual gaps
    l = 0
    for x in pad_col_grid_after:
        means.insert(x + l, 'dummy-' + str(l), np.nan)
        l += 1
    none_row = pd.DataFrame(means.iloc[0, :].copy()).T
    none_row[:] = np.nan
    none_row.index = ['dummy-row']
    l = 0
    for x in pad_row_grid_after:
        means = pd.concat([means.iloc[0:(x + l)], none_row,
                           means.iloc[(x + l):]], axis=0, sort=False)
        l += 1

    # create figure spacing
    f2 = plt.figure(figsize=(10.25, 5.25))
    grid = plt.GridSpec(len(means.index), len(means.columns) + 1, figure=f2)
    axs2 = []
    axs2.append(plt.subplot(grid[0:len(means.index), 0:(len(means.columns))]))

    l = 0
    # iterate over variables of interest
    for idx, row in means.iterrows():
        # create a masked array exposing only the current row; NaN cells are
        # left unmasked since pcolormesh simply leaves them blank
        bool_mask = np.array(np.ones(means.shape), dtype=bool)
        bool_mask[l] = np.array(np.zeros(means.shape[1]), dtype=bool)
        bool_mask[means.isna()] = False
        means_masked = ma(means.values, bool_mask)
        # pick colormaps depending on whether negative is good or bad
        if idx in ['reservoir_volume', 'raw_river_conc']:
            cmap = mpl.cm.PiYG
        else:
            cmap = mpl.cm.PiYG_r
        # shift the colormap to set the neutral colour to 0 (NaN-safe
        # min/max, since the dummy columns are all-NaN)
        row_min = np.nanmin(means.values[l])
        row_max = np.nanmax(means.values[l])
        biggest = max(abs(row_max), abs(row_min))
        shifted_cmap = shiftedColorMap(cmap,
                                       start=(0.5 - abs(row_min) / biggest / 2),
                                       stop=(0.5 + abs(row_max) / biggest / 2),
                                       name='shifted')
        # plot color grid
        pm = axs2[0].pcolormesh(x_spacing_mgrid, y_spacing_mgrid,
                                means_masked, linewidth=4, edgecolors='w',
                                cmap=shifted_cmap)
        if idx != 'dummy-row':
            # create an axis for the colorbar
            axs2.append(plt.subplot(grid[len(means.index) - l - 1,
                                         len(means.columns)]))
            # create the colorbar and hide its host axis
            cb = plt.colorbar(pm, ax=axs2[-1], aspect=10, pad=-10)
            axs2[-1].set_axis_off()
        l += 1

    # set ticks and labels, skipping the dummy gap positions
    x_spacing = np.array(x_spacing) + 0.5
    for x in pad_col_grid_after:
        x_spacing = np.delete(x_spacing, x)
    y_spacing = np.array(y_spacing) + 0.5
    for y in pad_row_grid_after:
        y_spacing = np.delete(y_spacing, y)
    axs2[0].set_xticks(x_spacing[0:-1])
    axs2[0].set_yticks(y_spacing[0:-1])
    axs2[0].set_xticklabels(labels=means.dropna(axis=1, how='all').columns,
                            rotation=45)
    axs2[0].set_yticklabels(labels=means.dropna(axis=0, how='all').index)
    # square cells, if desired (can interfere with the colorbars)
    axs2[0].set_aspect(1)
    return f2
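# A hedged usage sketch for colorgrid_plot. The DataFrame shape must satisfy
# the hard-coded pad_col_grid_after=[1, 5] and pad_row_grid_after=[2], i.e.
# at least 6 columns and 3 rows; shiftedColorMap must be available. Only
# 'reservoir_volume' and 'raw_river_conc' come from the function itself; the
# other row/column names and the output filename are illustrative. Note that
# colorgrid_plot inserts the dummy spacer columns into the passed frame in
# place.
import numpy as np
import pandas as pd

demo_means = pd.DataFrame(np.random.randn(4, 7),
                          index=['reservoir_volume', 'raw_river_conc',
                                 'treated_conc', 'demand'],
                          columns=['s%d' % i for i in range(7)])
demo_fig = colorgrid_plot(demo_means)
demo_fig.savefig('colorgrid.png')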