Example #1
 # assumed imports (not shown in the source):
 #   from numpy import zeros, tril_indices, triu_indices, unique, where
 #   from numpy.ma import masked_array as ma
 def _getSignificantData(self, sig_lvl):
     '''Find which edges are significant at the passed level and set self properties.
     '''
     rows, cols = self.data.shape  # rows == cols: square matrix
     mask = zeros((rows,cols))
     mask[tril_indices(rows,0)] = 1 #preparing mask
     cvals = unique(self.data[triu_indices(rows,1)]) # cvals is sorted
     # calculate the upper bound, i.e. the value in the distribution such
     # that sig_lvl of the data is greater than or equal to it. this is not
     # guaranteed to be precise because of repeated values. for instance,
     # assume the distribution of dissimilarity values is:
     # [.1, .2, .2, .2, .2, .3, .4, .5, .6, .6, .6, .6, .6, .6, .7]
     # and you want sig_lvl=.2, i.e. 20 percent of the linkages flagged as
     # significant. this selects the score .6, since it is third from the end
     # of the ordered list (of 15 elements, 3/15=.2). but since there is no
     # a priori way to tell which of the multiple .6 linkages are
     # significant, we select all of them, so the cutoff actually captures
     # 7/15ths of the data. the round call on ub avoids documented numpy
     # weirdness where >= comparisons are misassigned for long floats.
     ub = round(cvals[-int(round(sig_lvl * len(cvals)))], 7)
     mdata = ma(self.data, mask)
     self.actual_sig_lvl = \
         (mdata >= ub).sum()/float(mdata.shape[0]*(mdata.shape[0]-1)/2)
     self.sig_edges = where(mdata >= ub, 1, 0).nonzero()
     self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
     self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
     self.sig_otus = list(set(self.otu1+self.otu2))
     self.edges = list(zip(self.otu1, self.otu2))
     self.cvals = mdata[self.sig_edges[0], self.sig_edges[1]]
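
The cutoff logic above can be checked in isolation. Below is a minimal standalone sketch using the hypothetical dissimilarity list from the comment (note the method itself indexes into unique() values, so real fractions can differ):

vals = sorted([.1, .2, .2, .2, .2, .3, .4, .5, .6, .6, .6, .6, .6, .6, .7])
sig_lvl = 0.2

# index 3 from the end of the 15-element list: vals[-3] == 0.6
ub = round(vals[-int(round(sig_lvl * len(vals)))], 7)
actual = sum(v >= ub for v in vals) / float(len(vals))
print(ub, actual)  # 0.6 0.4666... -- the cutoff captures 7/15, not 3/15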
Example #2
 # assumed imports (not shown in the source):
 #   from numpy import (zeros, tril_indices, triu_indices, unique, where,
 #                      hstack, floor, ceil, inf, tril, ones)
 #   from numpy.ma import masked_array as ma
 def _getSignificantData(self, sig_lvl, empirical):
     '''Find which edges are significant at the passed level and set self properties.
     '''
     rows, cols = self.data.shape  # rows == cols: square matrix
     if empirical:
         mask = zeros((rows,cols))
         mask[tril_indices(rows,0)] = 1 #preparing mask
         cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted unique, strictly upper triangle
         alpha = sig_lvl/2.
         lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
         ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
         if sig_lvl==0.:
             lb = -inf
             ub = inf
         mdata = ma(self.cdata, mask)
         if lb == ub:
             # both tails share a cutoff value, so edges will be overcounted
             print('lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(),
                   (mdata <= lb).sum(), (mdata == lb).sum(),
                   (mdata == ub).sum(), lb == ub)
         # because of the floor and ceil calls we use >= for the upper bound
         # and <= for the lower bound. as an example, assume you have 100
         # values and choose sig_lvl=.05. then each tail should take 2.5
         # values. since the 2.5th value in the list does not exist, we round
         # down to the 2nd value for the lower bound and up to the 98th value
         # for the upper bound.
         upper_sig_edges = where(mdata>=ub,1,0).nonzero()
         lower_sig_edges = where(mdata<=lb,1,0).nonzero()
         e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
         e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
         self.sig_edges = (e1,e2)
         self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
         self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
         self.sig_otus = list(set(self.otu1+self.otu2))
         self.edges = list(zip(self.otu1, self.otu2))
         self.pvals = [self.data[i][j] for i,j in zip(self.sig_edges[0],
             self.sig_edges[1])]
     else:
         # correlation metrics are symmetric, so only upper-triangle entries
         # should be selected. adding 10 to every lower-triangle value (and
         # the diagonal) pushes it above any sig_lvl, so the <= test can only
         # pick upper-triangle p-values.
         # self.data is an nxn matrix of p-values; sig_edges is a tuple of
         # arrays giving the row, col indices of significant entries.
         self.sig_edges = \
             ((tril(10 * ones((rows, cols)), 0) + self.data) <= sig_lvl).nonzero()
         self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
         self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
         self.sig_otus = list(set(self.otu1+self.otu2))
         self.edges = list(zip(self.otu1, self.otu2))
         self.pvals = [self.data[i][j] for i,j in zip(self.sig_edges[0],
             self.sig_edges[1])]
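
The masking trick in the else branch can be verified on a toy matrix. A minimal standalone sketch with a hypothetical symmetric 3x3 p-value matrix (not from the source):

from numpy import array, ones, tril

pvals = array([[0.00, 0.01, 0.20],
               [0.01, 0.00, 0.03],
               [0.20, 0.03, 0.00]])  # symmetric matrix of p-values

# adding 10 to the lower triangle (and diagonal) disqualifies those cells,
# so <= sig_lvl can only select upper-triangle entries
offset = tril(10 * ones(pvals.shape), 0)
sig_edges = ((offset + pvals) <= 0.05).nonzero()
print(sig_edges)  # (array([0, 1]), array([1, 2])): upper triangle only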
Example #3
 # assumed imports (not shown in the source):
 #   from numpy import (zeros, tril_indices, triu_indices, unique, where,
 #                      hstack, floor, ceil, inf)
 #   from numpy.ma import masked_array as ma
 def _getSignificantData(self, sig_lvl):
     '''Find which edges are significant at the passed level and set self properties.
     '''
     rows, cols = self.data.shape  # rows == cols: square matrix
     mask = zeros((rows,cols))
     mask[tril_indices(rows,0)] = 1 #preparing mask
     cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted unique, strictly upper triangle
     alpha = sig_lvl/2.
     lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
     ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
     if sig_lvl==0.:
         lb = -inf
         ub = inf
     mdata = ma(self.cdata, mask)
     if lb == ub:
         # both tails share a cutoff value, so edges will be overcounted
         print('lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(),
               (mdata <= lb).sum(), (mdata == lb).sum(),
               (mdata == ub).sum(), lb == ub)
     # because of the floor and ceil calls we use >= for the upper bound
     # and <= for the lower bound. as an example, assume you have 100
     # values and choose sig_lvl=.05. then each tail should take 2.5
     # values. since the 2.5th value in the list does not exist, we round
     # down to the 2nd value for the lower bound and up to the 98th value
     # for the upper bound.
     upper_sig_edges = where(mdata>=ub,1,0).nonzero()
     lower_sig_edges = where(mdata<=lb,1,0).nonzero()
     e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
     e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
     self.sig_edges = (e1,e2)
     self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
     self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
     self.sig_otus = list(set(self.otu1+self.otu2))
     self.edges = list(zip(self.otu1, self.otu2))
     self.pvals = [self.data[i][j] for i,j in zip(self.sig_edges[0],
         self.sig_edges[1])]
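
The two-tailed cutoff computation at the top of this example can be exercised on its own. A minimal standalone sketch with hypothetical correlation values (variable names mirror the method's):

from numpy import floor, ceil, unique
from numpy.random import default_rng

cvals = unique(default_rng(0).uniform(-1, 1, 100))  # sorted, unique 'correlations'
alpha = 0.05 / 2.  # sig_lvl=.05, half in each tail

lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
print(lb, ub)  # roughly 2.5% of values sit at or below lb, and at or above ub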
Example #4
    # assumed imports (not shown in the source):
    #   from numpy import (zeros, tril_indices, triu_indices, unique, where,
    #                      hstack, floor, ceil, inf, triu)
    #   from numpy.ma import masked_array as ma
    def _getSignificantData(self, sig_lvl, empirical, corr_filter):
        '''Find which edges are significant at the passed level and set self properties.
        '''
        rows, cols = self.pdata.shape  # rows == cols: square matrix
        if empirical and corr_filter:
            raise ValueError("can't use both the empirical and correlation filters")

        if corr_filter is not None:
            # find edges which are significant enough based on sig_lvl
            se = self.pdata <= sig_lvl
            # find edges which are significant enough based on corr_filter
            pe = abs(self.cdata) >= corr_filter
            self.sig_edges = triu(se * pe, 1).nonzero()
            self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
            self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
            self.sig_otus = list(set(self.otu1 + self.otu2))
            self.edges = list(zip(self.otu1, self.otu2))
            self.pvals = [
                self.pdata[i][j]
                for i, j in zip(self.sig_edges[0], self.sig_edges[1])
            ]
        else:
            if empirical:
                mask = zeros((rows, cols))
                mask[tril_indices(rows, 0)] = 1  #preparing mask
                cvals = unique(self.cdata[triu_indices(rows, 1)])  # sorted unique, strictly upper triangle
                alpha = sig_lvl / 2.
                lb = round(cvals[int(floor(alpha * len(cvals)))], 7)
                ub = round(cvals[-int(ceil(alpha * len(cvals)))], 7)
                if sig_lvl == 0.:
                    lb = -inf
                    ub = inf
                mdata = ma(self.cdata, mask)
                if lb == ub:
                    # both tails share a cutoff value, so edges will be overcounted
                    print('lb, ub: %s %s' % (lb, ub), (mdata >= ub).sum(),
                          (mdata <= lb).sum(), (mdata == lb).sum(),
                          (mdata == ub).sum(), lb == ub)
                # because of the floor and ceil calls we use >= for the upper
                # bound and <= for the lower bound. as an example, assume you
                # have 100 values and choose sig_lvl=.05. then each tail
                # should take 2.5 values. since the 2.5th value in the list
                # does not exist, we round down to the 2nd value for the
                # lower bound and up to the 98th value for the upper bound.
                upper_sig_edges = where(mdata >= ub, 1, 0).nonzero()
                lower_sig_edges = where(mdata <= lb, 1, 0).nonzero()
                e1 = hstack([upper_sig_edges[0], lower_sig_edges[0]])
                e2 = hstack([upper_sig_edges[1], lower_sig_edges[1]])
                self.sig_edges = (e1, e2)
                self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
                self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
                self.sig_otus = list(set(self.otu1 + self.otu2))
                self.edges = list(zip(self.otu1, self.otu2))
                self.pvals = [
                    self.pdata[i][j]
                    for i, j in zip(self.sig_edges[0], self.sig_edges[1])
                ]
            else:
                # correlation metrics are symmetric, so only upper-triangle
                # entries should be selected: zero out the lower triangle
                # (including the diagonal) of the boolean p-value mask.
                # self.pdata is an nxn matrix of p-values; sig_edges is a
                # tuple of arrays giving the row, col indices.
                tmp = (self.pdata <= sig_lvl)
                tmp[tril_indices(self.pdata.shape[0], 0)] = 0
                self.sig_edges = tmp.nonzero()
                self.otu1 = [self.otu_ids[i] for i in self.sig_edges[0]]
                self.otu2 = [self.otu_ids[i] for i in self.sig_edges[1]]
                self.sig_otus = list(set(self.otu1 + self.otu2))
                self.edges = list(zip(self.otu1, self.otu2))
                self.pvals = [
                    self.pdata[i][j]
                    for i, j in zip(self.sig_edges[0], self.sig_edges[1])
                ]
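
The corr_filter branch combines two boolean masks before keeping only the upper triangle. A minimal standalone sketch with hypothetical 3x3 matrices (not from the source):

from numpy import array, triu

pdata = array([[0.00, 0.01, 0.02],
               [0.01, 0.00, 0.30],
               [0.02, 0.30, 0.00]])  # p-values
cdata = array([[1.0, 0.9, 0.1],
               [0.9, 1.0, 0.2],
               [0.1, 0.2, 1.0]])     # correlations

se = pdata <= 0.05       # significant enough p-value
pe = abs(cdata) >= 0.5   # strong enough correlation
sig_edges = triu(se * pe, 1).nonzero()
print(sig_edges)  # (array([0]), array([1])): only the strong AND significant edge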
Example #5
# assumed imports (not shown in the source):
#   import numpy as np
#   import pandas as pd
#   import matplotlib as mpl
#   import matplotlib.pyplot as plt
#   from numpy.ma import masked_array as ma
# shiftedColorMap: helper defined elsewhere in the project that recenters a colormap
def colorgrid_plot(means):
    # Create grid spacing: pads are inserted after these columns/rows
    pad_col_grid_after = [1, 5]
    pad_row_grid_after = [2]
    pad_size = 0.2
    normal_size = 1

    def pad_spacing(no, pad_size, normal_size, gaps):
        count = 0
        spacing = [0]
        for x in range(1, no + 1):
            count += normal_size
            spacing.append(count)
            if x in gaps:
                count += pad_size
                spacing.append(count)
        return spacing

    x_spacing = pad_spacing(len(means.columns), pad_size, normal_size,
                            pad_col_grid_after)
    y_spacing = pad_spacing(len(means.index), pad_size, normal_size,
                            pad_row_grid_after)
    x_spacing_mgrid, y_spacing_mgrid = np.meshgrid(x_spacing, y_spacing)

    #insert dummy columns/rows
    l = 0
    for x in pad_col_grid_after:
        means.insert(x + l, 'dummy-' + str(l), np.nan)
        l += 1
    none_row = pd.DataFrame(means.iloc[0, :].copy()).T
    none_row[:] = np.nan
    none_row.index = ['dummy-row']
    l = 0
    for x in pad_row_grid_after:
        means = pd.concat(
            [means.iloc[0:(x + l)], none_row, means.iloc[(x + l):]],
            axis=0,
            sort=False)
        l += 1

    #Create figure spacing
    f2 = plt.figure(figsize=(10.25, 5.25))
    grid = plt.GridSpec(len(means.index), len(means.columns) + 1, figure=f2)
    axs2 = []
    axs2.append(plt.subplot(grid[0:len(means.index), 0:(len(means.columns))]))

    l = 0

    #Iterate over variables of interest
    for idx, row in means.iterrows():
        #Create a masked array of the variable of interest
        bool_mask = np.array(np.ones(means.shape), dtype=bool)
        bool_mask[l] = np.array(np.zeros(means.shape[1]), dtype=bool)
        bool_mask[means.isna()] = False
        means_masked = ma(means.values, bool_mask)

        #Pick colormaps depending on whether negative is good or bad
        if idx in ['reservoir_volume', 'raw_river_conc']:
            cmap = mpl.cm.PiYG
        else:
            cmap = mpl.cm.PiYG_r

        #Shift the colormap to set the neutral colour to 0
        shifted_cmap = shiftedColorMap(
            cmap,
            start=(
                1 - 0.5 - abs(min(means.values[l])) /
                max(abs(max(means.values[l])), abs(min(means.values[l]))) / 2),
            stop=(
                0.5 + abs(max(means.values[l])) /
                max(abs(max(means.values[l])), abs(min(means.values[l]))) / 2),
            name='shifted')

        #Plot color grid
        pm = axs2[0].pcolormesh(x_spacing_mgrid,
                                y_spacing_mgrid,
                                means_masked,
                                linewidth=4,
                                edgecolors='w',
                                cmap=shifted_cmap)
        if idx != 'dummy-row':
            #Create axis for colorbar
            axs2.append(
                plt.subplot(grid[len(means.index) - l - 1,
                                 len(means.columns)]))

            #Create colorbar and set axis invisible
            cb = plt.colorbar(pm, ax=axs2[-1], aspect=10, pad=-10)
            axs2[-1].set_axis_off()

        l += 1

    #Set ticks and labels
    x_spacing = np.array(x_spacing) + 0.5
    for x in pad_col_grid_after:
        x_spacing = np.delete(x_spacing, x)

    y_spacing = np.array(y_spacing) + 0.5
    for y in pad_row_grid_after:
        y_spacing = np.delete(y_spacing, y)

    axs2[0].set_xticks(x_spacing[0:-1])
    axs2[0].set_yticks(y_spacing[0:-1])
    axs2[0].set_xticklabels(labels=means.dropna(axis=1, how='all').columns,
                            rotation=45)
    axs2[0].set_yticklabels(labels=means.dropna(axis=0, how='all').index)
    axs2[0].set_aspect(1)  # if you want squares (but it can interfere with the colorbars)
    return f2
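
Pulled out to module level, the nested pad_spacing helper can be sanity-checked on its own; the boundaries it returns are what pcolormesh consumes, and each pad interval lines up with a dummy NaN row or column, which renders blank. A quick check with hypothetical inputs:

# four unit-width cells with a 0.2 pad inserted after the first one
spacing = pad_spacing(4, pad_size=0.2, normal_size=1, gaps=[1])
print(spacing)  # [0, 1, 1.2, 2.2, 3.2, 4.2] -> 4 data cells plus 1 pad cell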