Example #1
 def _SNR(self, minimum_noise_level=0.001):
   '''
   Calculate signal to noise ratio. Signal is the highest
   CWT intensity of all scales, noise is the 95% quantile
   of the lowest scale WT, which is dominated by noise.
   '''
   ridge_info=self.ridge_info
   cwt=self.CWT.getdata()
   noise_cwt=cwt[0]
   # minimum noise is the noise value for the whole dataset times minimum_noise_level
   minimum_noise=float(minimum_noise_level*mquantiles(
                       noise_cwt,
                       0.95,
                       3./8., 3./8.))
   for info in ridge_info:
     scale=max(3, info[2]) # get a minimal width of 30 items for noise calculation
     signal=info[3]
     base_left=max(0, int(info[1]-scale*5))
     base_right=int(info[1]+scale*5)
     noise=mquantiles(noise_cwt[base_left:base_right+1],
                      0.95,
                      3./8., 3./8.)
     noise=numpy.nan_to_num(noise)
     noise=float(max([minimum_noise, noise]))
     info.append(signal/noise)
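
A minimal, self-contained sketch of the quantile-based noise floor used above, on synthetic data (the CWT row, peak position, scale, and signal value are assumptions made for illustration):

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(0)
noise_cwt = rng.normal(0.0, 1.0, 1000)                    # lowest-scale WT row, noise dominated
global_noise = float(mquantiles(noise_cwt, 0.95, 3./8., 3./8.)[0])
minimum_noise = 0.001 * global_noise                      # global noise floor

# local noise in a +/- 5*scale window around a hypothetical peak
position, scale, signal = 500, 4, 8.0
left, right = max(0, position - 5*scale), position + 5*scale
local_noise = float(mquantiles(noise_cwt[left:right+1], 0.95, 3./8., 3./8.)[0])
print("SNR:", signal / max(minimum_noise, local_noise))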
Example #2
def makeReferenceRLHistogram(alnRatios, refLengths, outfile, format, quantile=None):
    fig = plt.figure(dpi=300, figsize=(6, 6))
    ax = fig.add_subplot(111)
    ax.set_title("Aligned References RL Density Plot")

    fullPass = alnRatios[alnRatios['IsFullPass']]
    HQRegion = alnRatios[n.any([alnRatios['IsFullPass'], alnRatios['IsHQTrimmed']], axis=0)]

    max_y = 0
    for l, label in zip((alnRatios, HQRegion, fullPass), ("Aln from All Subreads", "Aln from HQRegion Subreads", "Aln from HQRegion Full-Pass Subreads")):
        alnRefLength = l['RefLength']

        if quantile is not None:
            alnRefLength = alnRefLength[alnRefLength < mstats.mquantiles(alnRefLength, [quantile])[0]]

        num, bins, patches = ax.hist(alnRefLength, bins=100, histtype='step', label=label, density=True)

        if n.max(num) > max_y:
            max_y = n.max(num)

    if quantile is not None:
        refLengths = refLengths[refLengths < mstats.mquantiles(refLengths, [quantile])[0]]

    num, bins, patches = ax.hist(refLengths, bins=100, histtype='step', label="All References", density=True)

    if n.max(num) > max_y:
        max_y = n.max(num)

    ax.set_ylim(0, max_y * 1.1)
    ax.legend(loc='upper center', prop={'size': 'small'})
    fig.savefig(outfile, format=format)
Example #3
 def bootstrap(self,pred,expect) :
   """
   Calculate bootstrapped values
   
   Parameters
   ----------
   pred : numpy array
     the bootstrapped predicted values
   expect : numpy array
     the bootstrapped expected values
   """
   nboots = pred.shape[1]
   nval = pred.shape[0]
   if nboots < 1 : return
   self.bootstrapped = np.zeros(nboots)
   for i in range(nboots) :
     self.bootstrapped[i] = self.eval(pred[:,i],expect[:,i])
   self.delta = self.bootstrapped - self.biased
   self.std = np.std(self.bootstrapped,ddof=1)
   self.av = np.sum(self.bootstrapped)/nboots
   self.bias = np.sum(self.delta)/nboots
   self.unbiased = self.biased+self.bias
   self.median = np.median(self.bootstrapped)
   self.nlow = self.lower(self.biased - 1.96*self.std/np.sqrt(nval))
   self.nhigh = self.upper(self.biased + 1.96*self.std/np.sqrt(nval))
   self.dlow = self.lower(self.unbiased - stat.mquantiles(self.delta,prob=[0.95]))
   self.dhigh = self.upper(self.unbiased - stat.mquantiles(self.delta,prob=[0.05]))
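
A standalone sketch of the delta-percentile interval computed in dlow/dhigh above, with RMSE standing in for self.eval and the first bootstrap column standing in for the original-sample estimate (both assumptions; the class's lower()/upper() clamps are omitted):

import numpy as np
from scipy.stats.mstats import mquantiles

def rmse(pred, expect):
    return np.sqrt(np.mean((pred - expect) ** 2))

rng = np.random.default_rng(1)
nval, nboots = 200, 500
expect = rng.normal(size=(nval, nboots))
pred = expect + rng.normal(scale=0.5, size=(nval, nboots))

biased = rmse(pred[:, 0], expect[:, 0])                 # stand-in for the original-sample estimate
boot = np.array([rmse(pred[:, i], expect[:, i]) for i in range(nboots)])
delta = boot - biased
unbiased = biased + delta.mean()                        # bias-corrected estimate
dlow = unbiased - mquantiles(delta, prob=[0.95])[0]     # 90% interval from the delta quantiles
dhigh = unbiased - mquantiles(delta, prob=[0.05])[0]
print(dlow, unbiased, dhigh)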
Example #4
def distribution():
    data = get_statistics()        
    ratio = data['OrderQty'] / data['adv']
    print data
    filter_ratio = ratio[ratio > 0.05]
    count,division = np.histogram(filter_ratio, 0.025 * np.arange(80))
    
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    filter_ratio.hist(ax = ax1, bins = division)
    
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111)
    ax2.set_title('Repartition of Order Turnover (greater than 1M)')
    ax2.set_xlabel('Order Turnover (Millions of Euros)')
    ax2.set_ylabel('Number of Orders')
    ax2.xaxis.set_major_formatter(FixedOrderFormatter(6))
    filtered_turnover = data[np.logical_and(np.isfinite(data['turnover']), data['turnover']>0)]      
    turnover = filtered_turnover['turnover'] * filtered_turnover['rate_to_euro']      
    count,division = np.histogram(turnover, bins = np.arange(1e6, max(turnover), 2.5e5))    
    turnover.hist(ax = ax2, bins = division, color = kc_main_colors()['dark_blue'])
 
    print mquantiles(turnover.values, [0.99, 0.995, 0.999])
    
    plt.show()
Example #5
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    
    X = np.asarray(X)
     
    y = np.asarray(y)
    
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0

    svr.fit(X, y)
    
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print 'MAE: ', mean_absolute_error(test_y, pred)
        print 'RMSE: ', sqrt(mean_squared_error(test_y, pred))
        print 'corrpearson: ', sp.stats.pearsonr(test_y, pred)
        print 'r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2
        print mquantiles(test_y, prob=[0.10, 0.90])
        print mquantiles(pred, prob=[0.10, 0.90])
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print >>output, p
    return
Example #6
    def get_map(self, name, burn, thinning, method='fit'):
        from scipy.stats.mstats import mquantiles
        d = name if isinstance(name, np.ndarray) else self.get_samples(burn, thinning, name)
        method = method.lower()

        if method == 'fit':
            map, ep, em = fit_distribution(d)
        elif method == 'fit2':
            map, ep, em, map2, ep2, em2, x = fit_distribution2(d)
            tt = np.linspace(d.min(), d.max(), 500)
            ft = ((1-x)*asymmetric_gaussian(tt, map, ep, em)
                  + x*asymmetric_gaussian(tt, map2, ep2, em2))
            map = tt[ft.argmax()]
            em  = mquantiles(d[d<map],[1-2*0.341])[0]
            ep  = mquantiles(d[d>map],[2*0.341])[0]
        elif method == 'median':
            map, ep, em = mquantiles(d, [0.5, 0.5-0.341, 0.5+0.341])
        elif method == 'histogram':
            # 'data' and 'res' were undefined here; assume the sample array and a fixed bin count
            nb, vl = np.histogram(d, bins=100)
            mid    = np.argmax(nb)
            map    = 0.5*(vl[mid]+vl[mid+1])
            em  = mquantiles(d[d<map],[1-2*0.341])[0]
            ep  = mquantiles(d[d>map],[2*0.341])[0]
        else:
            raise ValueError("unknown method: %s" % method)

        return map, ep, em
Example #7
    def _compute_sig(self):
        """Calculates the significance level of the variable tested"""

        m = self._est_cond_mean()
        Y = self.endog
        X = self.exog
        n = np.shape(X)[0]
        u = Y - m
        u = u - np.mean(u)  # center
        fct1 = (1 - 5**0.5) / 2.
        fct2 = (1 + 5**0.5) / 2.
        u1 = fct1 * u
        u2 = fct2 * u
        r = fct2 / (5 ** 0.5)
        I_dist = np.empty((self.nboot,1))
        for j in range(self.nboot):
            u_boot = copy.deepcopy(u2)

            prob = np.random.uniform(0,1, size = (n,1))
            ind = prob < r
            u_boot[ind] = u1[ind]
            Y_boot = m + u_boot
            I_dist[j] = self._compute_test_stat(Y_boot, X)

        sig = "Not Significant"
        if self.test_stat > mquantiles(I_dist, 0.9):
            sig = "*"
        if self.test_stat > mquantiles(I_dist, 0.95):
            sig = "**"
        if self.test_stat > mquantiles(I_dist, 0.99):
            sig = "***"

        return sig
Example #8
def sy_integral_function(q, x, y):
	f1_inv = mquantiles(x, [q])
	f2_inv = mquantiles(y, [q])
	
	if ( f1_inv[0] == 0.0 and f2_inv[0] == 0.0 ):
		return 1.0
	else:
		return min(f1_inv[0], f2_inv[0])/float(max(f1_inv[0], f2_inv[0]))
Example #9
def _compute_min_std_IQR(data):
    """Compute minimum of std and IQR for each variable."""
    s1 = np.std(data, axis=0)
    q75 = mquantiles(data, 0.75, axis=0).data[0]
    q25 = mquantiles(data, 0.25, axis=0).data[0]
    s2 = (q75 - q25) / 1.349  # IQR
    dispersion = np.minimum(s1, s2)
    return dispersion
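
A quick check of the robust dispersion above: IQR/1.349 estimates the standard deviation for Gaussian data, so taking the per-column minimum guards against outlier-inflated std (the synthetic data is an assumption):

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(2)
data = rng.normal(size=(500, 3))
data[:10, 0] += 50                  # outliers inflate the std, not the IQR

q75 = mquantiles(data, 0.75, axis=0).data[0]
q25 = mquantiles(data, 0.25, axis=0).data[0]
print(np.minimum(np.std(data, axis=0), (q75 - q25) / 1.349))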
Example #10
def _makeHexbinHist(x, y, x_label, y_label, title, outfile, format, quantile=None):
    nullfmt = NullFormatter()
    
    left, width = 0.1, 0.6
    bottom, height = 0.1, 0.6
    bottom_h = left_h = left+width+0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    fig = plt.figure(dpi=300, figsize=(8,8))
    fig.suptitle(title)

    axHexbin = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # no labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    if quantile is not None:
        mask = n.all([x < mstats.mquantiles(x, [quantile])[0], y < mstats.mquantiles(y, [quantile])[0]], axis=0)
        x = x[mask]
        y = y[mask]
        
    x_min = n.min(x)
    x_max = n.max(x)
    y_min = n.min(y)
    y_max = n.max(y)

    axHexbin.hexbin(x, y, bins='log', edgecolors='none', cmap=plt.cm.hot)
    axHexbin.set_xlim(x_min, x_max)
    axHexbin.set_xlabel(x_label)
    axHexbin.set_ylim(y_min, y_max)
    axHexbin.set_ylabel(y_label)
    
    if x_max < 1.: # is a fraction
        bins_for_x = 50
    else:
        bins_for_x = int((x_max - x_min) / 100) + 1
    if y_max < 1.:
        bins_for_y = 50
    else:
        bins_for_y = int((y_max - y_min) / 100) + 1
    axHistx.hist(x, bins=bins_for_x)
    axHistx.set_xlim(x_min, x_max)
    axHisty.hist(y, bins=bins_for_y, orientation='horizontal')
    axHisty.set_ylim(y_min, y_max)
    for label in axHisty.get_xticklabels():
        label.set_rotation('vertical')

    fig.savefig(outfile, format=format)
Example #11
    def _real_paste(self, box, u):
        '''
        
        returns two candidate new boxes, pasted along upper and lower 
        dimension
        
        :param box: a PrimBox instance
        :param u: the uncertainty for which to paste
        :returns: two box lims and the associated indices
       
        '''

        box_diff = self.box_init[u][1]-self.box_init[u][0]
        pa = self.paste_alpha * box.yi.shape[0]
    
        pastes = []
        for direction in ['upper', 'lower']:
            box_paste = np.copy(box.box_lims[-1])
            test_box = np.copy(box.box_lims[-1])
            
            if direction == 'lower':
                i = 0
                box_diff = -1*box_diff
                test_box[u][1] = test_box[u][i]
                test_box[u][i] = self.box_init[u][i]
                indices = self.in_box(test_box)
                data = self.x[indices][u]
                
                paste_value = self.box_init[u][i]
                if data.shape[0] > 0:
                    b = (data.shape[0]-pa)/data.shape[0]
                    paste_value = mquantiles(data, [b], alphap=self.alpha, 
                                             betap=self.beta)[0]
                
                    
            elif direction == 'upper':
                i = 1
                test_box[u][0] = test_box[u][i]
                test_box[u][i] = self.box_init[u][i]
                indices = self.in_box(test_box)
                data = self.x[indices][u]
                
                paste_value = self.box_init[u][i]
                if data.shape[0] > 0:
                    b = (pa)/data.shape[0]
                    paste_value = mquantiles(data, [b], alphap=self.alpha, 
                                             betap=self.beta)[0]
           
            box_paste[u][i] = paste_value
            indices = self.in_box(box_paste)
            
            pastes.append((indices, box_paste))
    
        return pastes        
Example #12
def main(fname):
    collision_data = CollisionData.data_from_file(fname)

    t1 = collision_data.type1s
    t2 = collision_data.type2s
    deltaVs = collision_data.deltaVs
    distances = collision_data.distances

    low  = np.min(deltaVs)
    high = np.max(deltaVs)

    #diff_indices = np.where(t1 != t2)
    #diff_deltaVs = deltaVs[diff_indices]
    #diff_hist = gaussian_kde(diff_deltaVs)
    #diff_xs   = np.linspace(low, high, 200)

    #he_indices = np.where(np.logical_and(t1 == 0, t2 == 0))
    #he_deltaVs = deltaVs[he_indices]
    #he_hist = gaussian_kde(he_deltaVs)
    #he_xs   = np.linspace(low, high, 200)
    #he_dist = distances[he_indices]

    xe_indices = np.where(np.logical_and(t1 == 7, t2 == 7))
    xe_deltaVs = deltaVs[xe_indices]

    xe_xs   = np.linspace(low, high, 200)
    xe_dist = distances[xe_indices]
    #print np.mean(xe_dist), np.median(xe_dist), np.mean(distances), np.median(distances)
    print mquantiles(distances, prob=[0.8, 0.85, 0.9, 0.95, 0.975, 0.99])

    if len(xe_deltaVs) <= 1:
        return

    xe_hist = gaussian_kde(xe_deltaVs)

    fig = plt.figure()
    ax  = fig.add_subplot(1, 1, 1)

    # print np.mean(xe_dist)

    # update the view limits
    #ax.plot(diff_xs, diff_hist(diff_xs), c='r', marker='.', label='He-Xe')
    #ax.plot(he_xs, he_hist(he_xs), c='g', marker='.', label='He-He')
    ax.plot(xe_xs, xe_hist(xe_xs), c='b', marker='.', label='Xe-Xe')
    ax.set_xlim(0, high)
    ax.set_title("Collision Radius vs Difference in Velocity")
    ax.set_xlabel("Delta V (m/s)")
    ax.set_ylabel("Relative Density")
    ax.legend()

    fig.savefig(os.path.splitext(fname)[0] + ".png", dpi=250)
    plt.close()
    fig = None
    ax = None
Example #13
def generate_quantile_summary(result_summary_table, quantiles = numpy.linspace(0, 1, 101)):
    from scipy.stats.mstats import mquantiles
    import pandas
    
    summary_table = pandas.DataFrame.from_items([("id", result_summary_table["id"]), ("rmsd", result_summary_table["quartile"][...,0])])
    
    result = numpy.empty_like(quantiles, dtype=[("quantile", float), ("global_quantile_value", float), ("worst_per_structure_quantile_value", float)])
    result["quantile"] = quantiles
    result["global_quantile_value"] = mquantiles(summary_table["rmsd"].values, quantiles)
    result["worst_per_structure_quantile_value"] = mquantiles(summary_table.groupby("id")["rmsd"].max().values, quantiles)
    
    return result
Example #14
def filtering(control_file, affected_file, filtered_control_file, filtered_affected_file, max_pvalue = None, min_cov = None, max_cov = None, min_delta_methylation = None, filter_quantil = None):

    control_quantil = None
    affected_quantil = None
    if filter_quantil:
        control_quantil = mquantiles( np.loadtxt(control_file, delimiter='\t', usecols=(3,)), prob = [filter_quantil])[0]
        affected_quantil = mquantiles( np.loadtxt(affected_file, delimiter='\t', usecols=(3,)), prob = [filter_quantil])[0]

    non_filtered_sites = 0
    for site_counter, (control_line, affected_line) in enumerate( izip(open(control_file), open(affected_file)) ):
        c_chrom, c_start, c_end, c_cov, c_meth, c_strand = control_line.strip().split('\t')
        a_chrom, a_start, a_end, a_cov, a_meth, a_strand = affected_line.strip().split('\t')
        try:
            assert( c_chrom == a_chrom )
            assert( c_start == a_start )
            assert( c_end == a_end )
            assert( c_strand == a_strand )
        except AssertionError:
            sys.exit('This tool needs intersected input files, so that each site is present in both files, affected and control.\n %s : %s \n %s : %s \n %s : %s \n %s : %s \n' % (c_chrom, a_chrom, c_start, a_start, c_end, a_end, c_strand, a_strand))

        c_cov, c_meth, a_cov, a_meth = map(float, [c_cov, c_meth, a_cov, a_meth])
        if min_cov is not None and (a_cov < min_cov or c_cov < min_cov):
            continue
        if max_cov is not None and (a_cov > max_cov or c_cov > max_cov):
            continue
        if min_delta_methylation is not None and abs(a_meth - c_meth) < min_delta_methylation:
            continue
        if filter_quantil and (c_cov > control_quantil or a_cov > affected_quantil):
            continue

        if max_pvalue is not None:
            control_methylated = c_cov * c_meth / 100
            control_unmethylated = c_cov - control_methylated
            affected_methylated = a_cov * a_meth / 100
            affected_unmethylated = a_cov - affected_methylated
            try:
                #Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
                p = fisher_exact.pvalue(control_methylated, control_unmethylated, affected_methylated, affected_unmethylated)
                pvalue = p.two_tail
            except:
                oddsratio, pvalue = stats.fisher_exact([(control_methylated, control_unmethylated), (affected_methylated, affected_unmethylated)], alternative='two-sided')

            if pvalue > max_pvalue:
                continue

        non_filtered_sites += 1
        filtered_control_file.write(control_line)
        filtered_affected_file.write(affected_line)

    sys.stdout.write( "%s from %s filtered.\n" % (site_counter+1 - non_filtered_sites, site_counter + 1) )
    filtered_affected_file.close()
    filtered_control_file.close()
Example #15
    def PreparePloting(self):
        # Set plot ranges and titles

        maxlim = mquantiles(self.m2g.Y*self.m2g.norm, .98)
        minlim = mquantiles(self.m2g.Y*self.m2g.norm, .02)
        if minlim == maxlim: minlim, maxlim=0, 1
        self.ax1.set_ylim(minlim-(maxlim-minlim)*.1, maxlim+(maxlim-minlim)*.3)

        maxlim = mquantiles(self.m2g.K[:-1, :], .98)
        minlim = min(0, mquantiles(self.m2g.K[:-1, :], .02))
        if minlim == maxlim: minlim, maxlim=0, 1
        self.ax2.set_ylim(minlim-(maxlim-minlim)*.1, maxlim+(maxlim-minlim)*.1)
        self.ax1.set_title('Fitted data: '+self.name+',   '+r'$\chi^2/doF$: %.2f'%self.m2g.chi2)
        self.plot_step()
Example #16
def makeReferenceRLHistogram(alnRatios, refLengths, outfile, format, quantile):
    """
    X-axis: (unique) reference length 
    Y-axis: count
    """
    fig = plt.figure(dpi=300, figsize=(10, 6))
    ax = fig.add_subplot(111)
    ax.set_title("Aligned Reference Length Distribution (qCov>=80%)")

    fullPass = alnRatios[alnRatios['IsFullPass']&(alnRatios['rCov']>=.8)]
    fullLength = alnRatios[alnRatios['IsFullLength']&(alnRatios['rCov']>=.8)]
    
    if quantile is not None:
        refLengths = refLengths[refLengths < mstats.mquantiles(refLengths, [quantile])[0]]
    # plot all references first
    bins = 50
    y,binEdges = n.histogram(refLengths, bins=bins)
    bincenters = 0.5*(binEdges[1:]+binEdges[:-1])
    xnew = n.linspace(bincenters.min(), bincenters.max(), 100)
    ysmooth = spline(bincenters, y, xnew)
    # normalize by hand
    ysmooth = ysmooth*1./sum(ysmooth)
    ax.plot(xnew, ysmooth, '-', label="All References")
    max_y = max(ysmooth)

    for l, label in zip((fullPass, fullLength), ("Aligned full-pass subreads", "Aligned " + SeenName + " subreads")):
        alnRefLength = dict(zip(l['RefID'], l['RefLength']))
        if len(alnRefLength) == 0:
            continue
        alnRefLength = n.array(alnRefLength.values())
        if quantile is not None:
            alnRefLength = alnRefLength[alnRefLength < mstats.mquantiles(alnRefLength, [quantile])[0]]
        bins = int((max(alnRefLength)-min(alnRefLength))/100) + 1
        y,binEdges = n.histogram(alnRefLength, bins=bins)
        bincenters = 0.5*(binEdges[1:]+binEdges[:-1])
        xnew = n.linspace(bincenters.min(), bincenters.max(), 300)
        ysmooth = spline(bincenters, y, xnew)
        # normalize by hand
        ysmooth = ysmooth*1./sum(ysmooth)
        ax.plot(xnew, ysmooth, '-', label=label)
        max_y = max(max_y, max(ysmooth))
        #num, bins, patches = ax.hist(alnRefLength, bins=50, histtype='step', label=label, normed=True)
        #max_y = max(max_y, max(num))

    ax.set_ylim(0, max_y * 1.1)
    ax.legend(loc='upper center', prop={'size': 'small'})
    ax.set_xlabel("Reference Length")
    ax.set_ylabel("Fraction")
    fig.savefig(outfile, format=format)
Example #17
def discretePaste(x_init,y_init, y, name,
              box,box_init, paste_alpha, n, direction, obj_func):
    box_diff = box_init[name][1]-box_init[name][0]
    if direction == 'lower':
        i = 0
        paste_alpha = 1-paste_alpha
        box_diff = -1*box_diff
    if direction == 'upper':
        i = 1
    
    box_paste = np.copy(box)
    y_paste = y
    test_box = np.copy(box)
  
    if direction == 'lower':
        test_box[name][i+1] = test_box[name][i]
        test_box[name][i] = box_init[name][i]
        logical = in_box(x_init, test_box)
        data = x_init[logical][name]
        if data.shape[0] > 0:
            a = paste_alpha * y.shape[0]
            b = (data.shape[0]-a)/data.shape[0]
            paste_value = mquantiles(data, [b], alphap=1./3., betap=1./3.)[0]  # floats avoid Python 2 integer truncation
            paste_value = int(round(paste_value))
            box_paste[name][i] = paste_value
            logical = in_box(x_init, box_paste)
            y_paste = y_init[logical]
    
    if direction == 'upper':
        test_box[name][i-1] = test_box[name][i]
        test_box[name][i] = box_init[name][i]
        logical = in_box(x_init, test_box)
        data = x_init[logical][name]
        if data.shape[0] > 0:
            a = paste_alpha * y.shape[0]
            b = a/data.shape[0]
            paste_value = mquantiles(data, [b], alphap=1./3., betap=1./3.)[0]  # floats avoid Python 2 integer truncation
            paste_value = int(round(paste_value))
            box_paste[name][i] = paste_value
            logical = in_box(x_init, box_paste)
            y_paste = y_init[logical]

    # y means of pasted boxes
    obj = obj_func(y,  y_paste)
    
    # mass of pasted boxes
    mass_paste = y_init[logical].shape[0]/n

    return (obj, mass_paste, box_paste)
Example #18
def prune(B,H=None,per=.2,deg=True,cap='in'):
	# prune Graph based on degree or properties in cap
	# node based
	# H will always be preserved
	G=nx.DiGraph(B)
	if deg:
		if cap=='in':
			seq=G.in_degree().values()
			cut=mquantiles(seq,1-per)
			mk=1
		elif cap=='out':
			seq=G.out_degree().values()
			cut=mquantiles(seq,1-per)
			mk=2
		else:
			print 'Error in setting cap string'
			return None
	else:
		seq=[]
		S=zip(*G.nodes(True))[1]
		for w in S:
			try:
				seq.append(w[cap])
				mk=3
			except KeyError:
				print 'Some nodes lack cap string'
				return None
		cut=mquantiles(seq,1-per)
	to_del=[]
	for n in G.nodes_iter():
		G.node[n]['size']=2*G.in_degree(n)
		if H is not None:
			if n in H.nodes():
				continue
		if mk==1:
			t=G.in_degree(n)
			if t < cut:
				to_del.append(n)
		elif mk==2:
			t=G.out_degree(n)
			if t < cut:
				to_del.append(n)
		else:
			t=G.node[n][cap]
			if t < cut:
				to_del.append(n)
	G.remove_nodes_from(to_del)
	return G
Example #19
def create_grouped_index_df(bin_num):
    ## load the labels and start_time column for train and test data
    start_time = time.time()
    train_labels            = pd.read_csv(data_path + train_num_file, index_col='Id', usecols=['Id', dep_var_name])
    train_date_start_columm = pd.read_csv(data_path + train_date_file, index_col='Id', usecols=['Id', start_time_column_name])
    test_date_start_columm  = pd.read_csv(data_path + test_date_file, index_col='Id', usecols=['Id', start_time_column_name])
    end_time = time.time()
    print 'data loading takes ', round((end_time - start_time), 1), ' seconds.'

    ## join the start_time with labels, then drop the NaN in start_time
    labeled_start_time = pd.merge(train_labels, train_date_start_columm, how='left', left_index=True, right_index=True)
    ## this labeled_start_time dataFrame doesn't contain the NaN, therefore it can be directly used for calculating the mquantiles
    labeled_start_time = labeled_start_time[~labeled_start_time[start_time_column_name].isnull()]


    ##section to subset the data by start_time
    prob_list = [1.*i/bin_num for i in range(1, bin_num)]
    quantile_values = mquantiles(labeled_start_time[start_time_column_name], prob=prob_list)

    bins = [labeled_start_time[start_time_column_name].min()]
    bins.extend(quantile_values)
    bins.append(labeled_start_time[start_time_column_name].max())
    bin_names = [str(i) for i in range(len(bins)-1)]

    ## cut the entire dataframe into different time_windows by start_time
    tmp_train = train_date_start_columm.copy()
    tmp_test  = test_date_start_columm.copy()

    tmp_train['time_window_num'] = pd.cut(tmp_train[start_time_column_name], bins, labels=bin_names)
    tmp_test['time_window_num']  = pd.cut(tmp_test[start_time_column_name],  bins, labels=bin_names)
    ## create a row number column, start index is 1
    tmp_train['row_num'] = range(1, (tmp_train.shape[0] + 1))
    tmp_test['row_num']  = range(1, (tmp_test.shape[0] + 1))

    return tmp_train, tmp_test, bins, bin_names 
Example #20
def try_peel(x,y,j,peel_alpha, box,direction):
    '''
    make a test peel box
    
    returns a tuple (mean, volume, box)
    '''
    alpha = 1./3.  # floats so Python 2 integer division does not yield 0
    beta = 1./3.
    
    i=0
    if direction=='upper':
        peel_alpha = 1-peel_alpha
        i=1
    
    box_peel = mquantiles(x[:, j], [peel_alpha], alphap=alpha, betap=beta)[0]
    
    if direction=='lower':
        y_mean_peel = np.mean(y[ x[:, j] >= box_peel])
    if direction=='upper':
        y_mean_peel = np.mean(y[ x[:, j] <= box_peel])
    
    temp_box = copy.deepcopy(box)
    temp_box[i,j] = box_peel

    box_vol = vol_box(temp_box)
    
    return (y_mean_peel, box_vol, temp_box)
Example #21
  def do_mock_distance (self, dset, theta, thetastar, rng):
    ncount = dset.get_data (0)
    data = []
    if ncount.get_len () > 0:
      if self.true_data:
        mass_vec = ncount.get_lnM_true ()
        z_vec = ncount.get_z_true ()
        for i in range (mass_vec.len ()):
          data.append ([z_vec.get (i), mass_vec.get (i)])
      else:
        mass_mat = ncount.get_lnM_obs ()
        z_mat = ncount.get_z_obs ()
        for i in range (mass_mat.nrows ()):
          data.append ([z_mat.get (i, 0), mass_mat.get (i, 0)])
    data = np.array (data)

    data_bin = np.array([ [ item[0] for item in data if item[1] >= self.dm_choose[i] ] for i in range (len (self.dm_choose) - 1)])
    mock_summary = [ mquantiles( elem, prob=self.quant_list ) if len( elem ) > 0  else [ 0 for jj in self.quant_list]  for elem in data_bin ]
    distance = [ np.sqrt( sum( [ ( self.data_summary[ i ][ j ] -  mock_summary[ i ][ j ] ) ** 2  for j in range( len( self.data_summary[ i ] ) ) ] ) ) for i in range( len( self.data_summary ) ) ]

    del ncount
    del data
    del data_bin
    del mock_summary

    return sum (distance)
Example #22
def _get_par_summary(sim, n, probs):
    """Summarize chains merged and individually

    Parameters
    ----------
    sim : dict from stanfit object
    n : int
        parameter index
    probs : iterable of float
        quantile probabilities

    Returns
    -------
    summary : dict
       Dictionary containing summaries
    """
    # _get_samples gets chains for nth parameter
    ss = _get_samples(n, sim, inc_warmup=False)
    msdfun = lambda chain: (np.mean(chain), np.std(chain, ddof=1))
    qfun = lambda chain: mquantiles(chain, probs)
    c_msd = np.array([msdfun(s) for s in ss]).flatten()
    c_quan = np.array([qfun(s) for s in ss]).flatten()
    ass = np.asarray(ss).flatten()
    msd = np.asarray(msdfun(ass))
    quan = qfun(np.asarray(ass))
    return dict(msd=msd, quan=quan, c_msd=c_msd, c_quan=c_quan)
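
A sketch of the merged-versus-per-chain quantile summary above on fake chains; the stanfit plumbing (_get_samples) is replaced by a plain list of arrays:

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(3)
chains = [rng.normal(loc=m, size=1000) for m in (0.0, 0.1)]
probs = (0.025, 0.5, 0.975)

c_quan = np.array([mquantiles(c, probs) for c in chains])   # per chain
quan = mquantiles(np.concatenate(chains), probs)            # chains merged
print(c_quan)
print(quan)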
Example #23
def TigerCalculateEfficiency(list_tigerruns, N=1, beta=[0.95], background=0):
	"""
	CALCULATE EFFICIENCY FROM A LIST OF TIGERRUNS
	"""
	OddsBeta=[mquantiles(list_tigerruns[background].odds(N),prob=[b]) for b in beta]
	efficiencies = empty((len(list_tigerruns)-1,len(beta)))
	for i in xrange(len(list_tigerruns)):
		if N>list_tigerruns[i].nsources:
			stdout.write("... Warning: Not sufficient events (%s) to calculate the efficiency for %s sources. Writing zeros\n"%(list_tigerruns[i].nsources,N))
			if i < background:
				efficiencies[i,:] = 0.0
			else:
				efficiencies[i-1,:] = 0.0
			continue
		if i != background:
			tmp = list_tigerruns[i].odds(N)
			for j in xrange(len(OddsBeta)):
				msk = tmp>OddsBeta[j]
				nmsk = tmp<OddsBeta[j]
				nabovebeta=len(tmp[msk])
				ntotal=len(tmp)
				eff=float(nabovebeta)/float(ntotal)
				if i < background:
					efficiencies[i,j] = eff
				else:
					efficiencies[i-1,j] = eff
	return efficiencies
Example #24
def outlier_detection(q, time, mq, k=1.5):
    """
    calculates outliers using geodesic distances of the SRSFs from the median

    :param q: numpy ndarray of N x M of M SRSF functions with N samples
    :param time: vector of size N describing the sample points
    :param mq: median calculated using :func:`time_warping.srsf_align`
    :param k: cutoff threshold (default = 1.5)

    :return: q_outlier: outlier functions

    """
    N = q.shape[1]
    ds = zeros(N)
    for kk in range(0, N):
        ds[kk] = sqrt(trapz((mq - q[:, kk]) ** 2, time))

    quartile_range = mquantiles(ds)
    IQR = quartile_range[2] - quartile_range[0]

    thresh = quartile_range[2] + k * IQR

    ind = (ds > thresh).nonzero()

    q_outlier = q[:, ind]

    return q_outlier
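
The quartile fence above, standalone: mquantiles with its default prob returns the three quartiles, and anything beyond Q3 + k*IQR is flagged (the synthetic distances are an assumption):

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(4)
ds = np.concatenate([rng.chisquare(3, 200), [40.0, 55.0]])   # two injected outliers

q1, _, q3 = mquantiles(ds)          # default prob = [0.25, 0.5, 0.75]
thresh = q3 + 1.5 * (q3 - q1)
print(np.nonzero(ds > thresh))      # flags the injected outliers (and any extreme draws)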
Example #25
    def plotTomo(self, ax=None):
        # plt.gca() as a default argument is evaluated once at definition
        # time; resolve the current axes at call time instead
        if ax is None:
            ax = plt.gca()
        #fig = figure(figsize=(7,5))
        #ax = fig.add_axes([0.1, 0.1, 0.8, 0.85])
        fig = ax.get_figure()
        #ax.set_yscale('log', nonposy='clip')


        lim =  mquantiles(self.G,0.99)  
        self.G[self.G>lim*1.5] = lim*1.5#most probably failure
        #dr =  np.mean(np.diff(self.rho_grid))
        #dr = 1
        power = self.G[:,::-1].T/1e6
        img = ax.imshow(power,      extent=[self.tvec[0],self.tvec[-1]
            ,self.rho_grid[0],self.rho_grid[-1]], aspect='auto',clim=[0,lim/1e6]) 
        minorLocator   = plt.MultipleLocator(1)
        img.set_cmap('YlOrBr')
        ax.xaxis.set_minor_locator(minorLocator)
        ax.axis([self.tvec[0],self.tvec[-1],0,1])
        cb1 = fig.colorbar(img)
        cb1.set_label('$P $ [MW/m$^3$]')
        
        ax.set_ylabel(r'$\rho_\phi$ [-]')
        ax.set_xlabel('t [s]')
        Rvec, Tvec = np.meshgrid(self.rho_grid, self.tvec)
        CS = ax.contour(Tvec,Rvec, self.G,10,colors = 'k',alpha=0.2)
        
        
        #plt.show()
        #fig.savefig('G%d.png'%self.shot)
        
        return fig
Example #26
def _threshold_gradient(im):
    """Indicate pixel locations with gradient below the bottom 10th percentile

    Parameters
    ----------
    im : array
        The mean intensity images for each channel.
        Size: (num_channels, num_rows, num_columns).

    Returns
    -------
    array
        Binary values indicating whether the magnitude of the gradient is below
        the 10th percentile.  Same size as im.

    """

    if im.shape[0] > 1:
        # Calculate directional relative derivatives
        _, g_x, g_y = np.gradient(np.log(im))
    else:
        # Calculate directional relative derivatives
        g_x, g_y = np.gradient(np.log(im[0]))
        g_x = g_x.reshape([1, g_x.shape[0], g_x.shape[1]])
        g_y = g_y.reshape([1, g_y.shape[0], g_y.shape[1]])
    gradient_magnitudes = np.sqrt((g_x ** 2) + (g_y ** 2))
    below_threshold = []
    for chan in gradient_magnitudes:
        threshold = mquantiles(chan[np.isfinite(chan)].flatten(), [0.1])[0]
        below_threshold.append(chan < threshold)
    return np.array(below_threshold)
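
A quick usage check for the function above on a random single-channel image, assuming _threshold_gradient and its numpy/scipy imports are in scope; roughly 10% of pixels should come back True:

import numpy as np

rng = np.random.default_rng(5)
im = rng.uniform(0.1, 1.0, size=(1, 64, 64))   # one channel, strictly positive for np.log
mask = _threshold_gradient(im)
print(mask.shape, mask.mean())                  # fraction of True pixels, near 0.1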
Example #27
def maxdd_montecarlo(changes, runs=5000, length=None, serial_dependence=None, 
  quantiles=(0.75, 0.9, 0.975), return_array=False):
    if not length:
        length = len(changes)
    if not serial_dependence:
        seq = changes
        pick = lambda seq: [random.choice(seq)]
    else:
        # Serial dependance detected? Lets sample windows!
        class serial_sampler(object):
            def __init__(self, seq, size):
                self.seq = seq
                self.size = size
            def __len__(self):
                return len(self.seq) - self.size
            def __getitem__(self, i):
                return self.seq[i - self.size : i + self.size]
        pick = lambda seq: random.choice(seq)
        seq = serial_sampler(changes, serial_dependence)
    maxdds = []
    for i in xrange(runs):
        # sample a maxdd
        new_seq = []
        while len(new_seq) < length:
            new_seq += pick(seq)
        maxdds.append(maxdd(new_seq))
    results = {
        'mean maxdd': numpy.mean(maxdds),
        'sd of maxdds': numpy.std(maxdds),
        'quantiles': dict(zip(quantiles, mquantiles(maxdds, quantiles)))
        }
    if return_array:
        results['array of maxdd samples'] = maxdds
    return results
Example #28
def sample_n_genes_quartile(sample_size, sample, blast_report_suffix,quartile, RefSeq, blast_report_dir):
	#print "sampling this many:\t" + str(sample_size) + " genes for this sample\t" + sample + "\n"

	RefSeq_to_percent_covered_hash = make_refseq_to_percent_covered_hash(RefSeq,blast_report_suffix,blast_report_dir,sample)
	percent_covered_keys = RefSeq_to_percent_covered_hash.keys()
	percent_covered_scores = RefSeq_to_percent_covered_hash.values()
	
	quantiles = mquantiles(percent_covered_scores)
	
	score_range = ()  # renamed from 'range' to avoid shadowing the built-in
	if(quartile == "lower"):
		score_range = (0,quantiles[0])
	elif(quartile == "median"):
		score_range = (quantiles[0],quantiles[1])
	elif (quartile == "upper"):
		# the original upper bound of 0 could never match any score; use an open bound
		score_range = (quantiles[1],float("inf"))
				
	sampled_counter = 0
	sampled_list = {}
	if(len(RefSeq_to_percent_covered_hash.keys()) < sample_size):
		sample_size = len(RefSeq_to_percent_covered_hash.keys())

	while sampled_counter < sample_size:
		curr = percent_covered_keys[randint(0,len(percent_covered_scores)-1)]  # random.randint is inclusive at both ends
		curr_score = RefSeq_to_percent_covered_hash[curr]
		if(curr_score > score_range[0] and curr_score < score_range[1]):
			sampled_list[curr] = 1
			sampled_counter = sampled_counter + 1

	return sampled_list.keys()	
Example #29
def realPeel(x,y,n,name,peel_alpha, box,direction, obj_func):
    '''
    make a test peel box
    
    returns a tuple (mean, volume, box)
    '''
    alpha = 1./3.  # floats so Python 2 integer division does not yield 0
    beta = 1./3.
    
    i=0
    if direction=='upper':
        peel_alpha = 1-peel_alpha
        i=1

    box_peel = mquantiles(x[name], [peel_alpha], alphap=alpha, betap=beta)[0]
    if direction=='lower':
        logical = x[name] >= box_peel
    if direction=='upper':
        logical = x[name] <= box_peel  # was wrapped in a list, which breaks boolean indexing

    obj = obj_func(y,  y[logical])    
    temp_box = np.copy(box)
    temp_box[name][i] = box_peel
    box_mass = y[logical].shape[0]/n
    box_vol = box_mass
#    box_vol = vol_box(temp_box)
    return (obj, box_vol, temp_box, logical)
Example #30
def safety_production(CS, seuil):
    safety_production = 0
    for i in range(len(CS)):
        data = summ(CS[i]).values
        XXX = mquantiles(data,[seuil])
        safety_production += 10**-7 * XXX[0]
    return safety_production
Example #31
    def calibrate(self, X, Y, alpha, bbox=None, return_scores=False):
        if bbox is not None:
            self.init_bbox(bbox)

        # Store desired nominal level
        self.alpha = alpha

        # Compute predictions on calibration data
        q_calib = self.bbox.predict(X.astype(np.float32))

        # Estimate conditional histogram for calibration points
        d_calib = self.hist.compute_histogram(q_calib, self.ymin, self.ymax,
                                              alpha)

        # Initialize histogram accumulator (grey-box)
        accumulator = HistogramAccumulator(d_calib,
                                           self.grid_histogram,
                                           self.alpha,
                                           delta_alpha=self.delta_alpha)

        # Generate noise for randomization
        n2 = X.shape[0]
        if self.randomize:
            epsilon = np.random.uniform(low=0.0, high=1.0, size=n2)
        else:
            epsilon = None

        # Compute conformity scores
        if self.intervals:
            scores = accumulator.calibrate_intervals(Y.astype(np.float32),
                                                     epsilon=epsilon)
        else:
            # TODO: interval-free conformity scores are not implemented
            raise NotImplementedError("only interval scores are supported")

        # Compute upper quantile of scores
        level_adjusted = (1.0 - alpha) * (1.0 + 1.0 / float(n2))
        self.calibrated_alpha = np.round(
            1.0 - mquantiles(scores, prob=level_adjusted)[0], 4)

        # Print message
        print("Calibrated alpha (nominal level: {}): {:.3f}.".format(
            alpha, self.calibrated_alpha))

        return self.calibrated_alpha
Example #32
def get_split(data, variables, y_variable, min_samples_leaf, n_quantiles):
	
	variance = np.var(data[y_variable])
	split_value = None

	for variable in variables:
		value_list = data[variable]
		if len(np.unique(value_list))>n_quantiles:

			probs = [j/float(n_quantiles) for j in range(1,n_quantiles+1)]
			values = sc_st_mst.mquantiles(value_list,probs)

		else:
			if len(np.unique(value_list))==1:
				continue
			values = np.unique(value_list)	
				
		for value in values[:-1]:

			data_with_value = data[data[variable] <= value]
			data_without_value =  data[data[variable] > value]
			without_len = len(data_without_value.index)
			with_len = len(data_with_value.index)
			if (with_len < min_samples_leaf) or (without_len < min_samples_leaf):
				continue	
		
			### Ratios of each value of specified variable
			ratio = with_len/float(len(data.index))

			### split_variance shows how well the split separates the target values overall
			
			split_variance =  ratio*np.var(data_with_value[y_variable])+(1-ratio)*np.var(data_without_value[y_variable])

				
			if split_variance < variance :
				variance = split_variance
				split_variable = variable
				split_value = value

	if split_value is None:
		return None		
				
	

	return  split_variable, split_value, variance
Example #33
def binify_even_bin(X, N=10, dm=None, maxlag=None, **kwargs):
    """
    Returns a distance matrix with all entries sorted into bin numbers, along with an array of bin widths.
    The matrix has the same form as the distance matrix dm in squareform. The bins will be indexed from 0 to n.
    If dm is None, then nd_dist will be used to calculate a distance matrix; kwargs will be passed through to it.
    N specifies the number of bins; the quantile-based upper bounds give each bin roughly the same number of entries.

    :param X: np.array of x, y coordinates.
    :param N: int with the number of bins
    :param dm: numpy.ndarray with the distance matrix
    :param maxlag: maximum lag for the binning
    :param kwargs: will be passed to calculate the point_matrix if no dm is given
    :return:
    """

    _X = list(X)

    # check that all coordinates in the list have the same dimension and are not empty
    if not len(set([len(e) for e in _X])) == 1 or len(_X[0]) == 0:
        raise ValueError(
            "One or more Coordinates are missing.\nPlease provide the coordinates for all values "
        )

    # get the distance matrix
    if dm is None:
        _dm = nd_dist(_X, **kwargs)
    else:
        _dm = dm

    # create bin matrix as copy of dm
    bm = copy.deepcopy(_dm)

    # get the upper bounds by calculating the quantiles of the upper bounds
    binubound = mquantiles(np.array(_dm).flatten(),
                           prob=[i / N for i in range(1, N + 1)])

    # set all bins except the first one
    for i in range(1, N):
        bm[(_dm > binubound[i - 1]) & (_dm <= binubound[i])] = i

    # set the first bin
    bm[_dm < binubound[0]] = 0

    return np.matrix(bm), np.diff([0, *binubound])
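
The core of the equal-frequency binning above, standalone: quantiles of the flattened distances become bin upper bounds. np.digitize replaces the explicit loop here, an equivalent formulation rather than the original code:

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(6)
dm = rng.uniform(0, 100, size=(20, 20))
N = 5
binubound = np.asarray(mquantiles(dm.ravel(), prob=[i / N for i in range(1, N + 1)]))
bm = np.digitize(dm, binubound[:-1], right=True)   # bin indices 0..N-1
print(np.bincount(bm.ravel()))                     # roughly equal counts per bin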
Example #34
 def test_mquantiles_limit_keyword(self):
     # Regression test for Trac ticket #867
     data = np.array([[6., 7., 1.],
                      [47., 15., 2.],
                      [49., 36., 3.],
                      [15., 39., 4.],
                      [42., 40., -999.],
                      [41., 41., -999.],
                      [7., -999., -999.],
                      [39., -999., -999.],
                      [43., -999., -999.],
                      [40., -999., -999.],
                      [36., -999., -999.]])
     desired = [[19.2, 14.6, 1.45],
                [40.0, 37.5, 2.5],
                [42.8, 40.05, 3.55]]
     quants = mstats.mquantiles(data, axis=0, limit=(0, 50))
     assert_almost_equal(quants, desired)
Example #35
def convert_to_8bit(img_array):
    """Converts to 8 bit, but strething the contrast according to image stats"""
    img_array = img_array.astype('float32')
    int_quant = mquantiles(img_array.ravel(), [0.01, 0.99])

    # if the image is flat, return a constant image clipped at 255
    if int_quant[0] == int_quant[1]:
        flat_field = min(img_array.max(), 255)  # np.min(x, 255) would misread 255 as an axis argument
        return flat_field*np.ones_like(img_array)
    # Remove outliers
    img_array[img_array < int_quant[0]] = int_quant[0]
    img_array[img_array > int_quant[1]] = int_quant[1]

    img_array -= img_array.min()
    img_array /= img_array.max()
    img_array *= 255

    return img_array
Example #36
 def _set_thresholds(self, newthresholds=None):#low,high):
     "Defines the indicator thresholds for the definition of ENSO phases."
     _optinfo = self.optinfo
     if (newthresholds is not None):
         try:
             (low, high) = newthresholds
         except (TypeError, ValueError):
             raise TypeError("The input thresholds must be given as a "\
                             "sequence (low, high)")
         if low > high:
             (low, high) = (high, low)
         thresholds = (float(low), float(high))
     else:
         thresholds = mquantiles(self._series, (.25, .75), axis=None)
     if thresholds != _optinfo.get('thresholds', None):
         self._cachedmonthly = {}
         self._cachedcurrent = None
     _optinfo['thresholds'] = thresholds
Example #37
def plot_wwadist(wwa):
    ''' Plot the distribution of wwa with the 95% quantile line.

    Args:
        wwa (array): the weighted wavelet amplitude.

    Returns:
        fig (figure): the 2-D plot of wavelet analysis

    '''
    sns.set(style="darkgrid", font_scale=2)
    plt.subplots(figsize=[20, 4])

    q95 = mstats.mquantiles(wwa, 0.95, alphap=0.5, betap=0.5)
    fig = sns.distplot(np.nan_to_num(wwa.flat))
    fig.axvline(x=q95, ymin=0, ymax=0.5, linewidth=2, linestyle='-')

    return fig
Example #38
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles of ``X``.
    The grid is generated by placing ``grid_resolution`` equally
    spaced points between the ``percentiles`` of each column
    of ``X``.
    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme
        values of the grid axes.
    grid_resolution : int
        The number of equally spaced points that are placed
        on the grid.
    Returns
    -------
    grid : ndarray
        All data points on the grid; ``grid.shape[1] == X.shape[1]``
        and ``grid.shape[0] == grid_resolution * X.shape[1]``.
    axes : seq of ndarray
        The axes with which the grid has been created.
    """
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentile values must be in [0, 1]')

    axes = []
    for col in range(X.shape[1]):
        uniques = np.unique(X[:, col])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution use unique vals
            axis = uniques
        else:
            emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
            # create axis based on percentiles and grid resolution
            axis = np.linspace(emp_percentiles[0, col],
                               emp_percentiles[1, col],
                               num=grid_resolution,
                               endpoint=True)
        axes.append(axis)

    return cartesian(axes), axes
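
A one-column version of the grid construction above: the axis spans the 5th to 95th percentile of the feature with grid_resolution points (the synthetic X is an assumption):

import numpy as np
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(7)
X = rng.lognormal(size=(1000, 2))
lo, hi = mquantiles(X[:, 0], prob=(0.05, 0.95))
axis = np.linspace(lo, hi, num=100, endpoint=True)
print(axis[0], axis[-1])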
Example #39
    def process_outliers(self, mode='zscore'):
        """
        Check for outliers in calculated summary stats. Outliers are the few very high or very low values that can
        potentially introduce bias in tasks such as parameter inference. One can either remove them, replace with mean
        value, or use log scale for the statistic in question. This choice is left to the user.
        
        Parameters
        ----------
        mode : str, optional
            Either 'zscore' or the inter-quartile range 'iqr', by default 'zscore'
        
        Returns
        -------
        array
            Indices of the dataset's columns containing outliers
        """
        if mode == 'zscore':
            # This will give us per-feature/per-statistic z-scores
            zscores = zscore(self.s, axis=0)

            # Find columns where abs(zscore) > threshold
            zscore_threshold = 3
            violation_indices = np.argwhere(np.abs(zscores) > zscore_threshold)
            if len(violation_indices) < 1:
                return
            outlier_indices = np.unique(np.argwhere(np.abs(zscores) > zscore_threshold)[:, 1])
        else:
            # Outlier detection using IQR
            quants = mquantiles(self.s)
            iqr = quants[2] - quants[0]
            iqr_factor = 1.5
            violations_left = self.s < quants[0] - iqr_factor * iqr
            violations_right = self.s > quants[2] + iqr_factor * iqr
            violation_indices = np.argwhere(violations_left | violations_right)
            if len(violation_indices) < 1:
                return
            outlier_indices = np.unique(np.argwhere(violations_left | violations_right)[:, 1])

        if len(outlier_indices) > 0:
            self.outlier_column_indices = outlier_indices
            print('Dataset:process_outliers: found outliers at index/indices {}'.format(outlier_indices))
            print('Outliers can be transformed using the function Dataset.apply_func_to_outlier_columns if so desired.')

        return self.outlier_column_indices
Example #40
def nearest_neighbors(lab_im, n=3, quantiles=[0.05, 0.25, 0.5, 0.75, 0.95]):
    """Find the distances to and angle between the n nearest neighbors.

    Parameters
    ----------
    lab_im : 2D array of int
        An image of labeled objects.
    n : int, optional
        How many nearest neighbors to check. (Angle is always between
        the two nearest only.)
    quantiles : list of float in [0, 1], optional
        Which quantiles of the features to compute.

    Returns
    -------
    nei : 1D array of float, shape (5 * (n + 1),)
        The quantiles of sines, cosines, angles, and `n` nearest neighbor
        distances.
    names : list of string
        The name of each feature.
    """
    if lab_im.dtype == bool:
        lab_im = nd.label(lab_im)[0]
    centroids = np.array(
        [p.centroid for p in measure.regionprops(lab_im, coordinates='rc')])
    nbrs = (NearestNeighbors(n_neighbors=(n + 1),
                             algorithm='kd_tree').fit(centroids))
    distances, indices = nbrs.kneighbors(centroids)
    angles = triplet_angles(centroids, indices[:, :3])
    # ignore order/orientation of vectors, only measure acute angles
    angles[angles > np.pi] = 2 * np.pi - angles[angles > np.pi]
    distances[:, 0] = angles
    sines, cosines = np.sin(angles), np.cos(angles)
    features = np.hstack(
        (sines[:, np.newaxis], cosines[:, np.newaxis], distances))
    nei = mquantiles(features, quantiles, axis=0).ravel()
    colnames = (['sin-theta', 'cos-theta', 'theta'] +
                ['d-neighbor-%i-' % i for i in range(1, n + 1)])
    names = [
        '%s-percentile-%i' % (colname, int(q * 100))
        for colname, q in it.product(colnames, quantiles)
    ]
    return nei, names
Example #41
def get_stats(arr):
    sz = arr.size
    amin, amax = arr.min(), arr.max()
    q = ms.mquantiles(arr, [0.1, 0.5, 0.9])
    mu = arr.mean()
    sigma = arr.std()
    cv = sigma / mu

    return {
        "size": sz,
        "min": amin,
        "max": amax,
        "pct10": q[0],
        "pct50": q[1],
        "pct90": q[2],
        "mu": mu,
        "sigma": sigma,
        "cv": cv
    }
Example #42
def compute_CDF_quantiles(CDFs, confidence=95.0):
    """
        Takes a 2D array of CDFs of size (N_bs, N_bins).
        N_bs stands for the number of bootstraps
        N_bins stands for the number of bins within the CDF.
        Returns the median, lower and upper bounds at the desired confidence level.
    """

    # Create percentiles:
    lower_percentile = (1. - confidence / 100.) / 2.
    upper_percentile = 1. - lower_percentile
    # Compute the percentiles for each bin
    q = mstats.mquantiles(CDFs.T,
                          prob=[lower_percentile, 0.5, upper_percentile],
                          axis=1)
    lower = q.T[0]
    median = q.T[1]
    upper = q.T[2]
    return median, lower, upper
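
A usage sketch for the band computation above with fake bootstrap CDFs (rows of sorted uniforms), assuming the function and its scipy.stats.mstats import are in scope; shapes follow the docstring, (N_bs, N_bins):

import numpy as np

rng = np.random.default_rng(8)
N_bs, N_bins = 200, 50
CDFs = np.sort(rng.uniform(size=(N_bs, N_bins)), axis=1)
median, lower, upper = compute_CDF_quantiles(CDFs, confidence=95.0)
print(median.shape, lower.shape, upper.shape)   # (50,) each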
Example #43
def qq(data, ax, color):
    xmax = 0
    ymax = 0
    alpha = 0.9
    color = '#000000'
    n_quantiles = 100

    q_pos = np.concatenate([
        np.arange(99.) / len(data),
        np.logspace(-np.log10(len(data)) + 2, 0, n_quantiles)
    ])

    q_data = mquantiles(data, prob=q_pos, alphap=0, betap=1, limit=(0, 1))
    q_th = q_pos.copy()
    q_err = np.zeros([len(q_pos), 2])
    for i in range(0, len(q_pos)):
        q_err[i, :] = beta.interval(
            alpha,
            len(data) * q_pos[i],
            len(data) - len(data) * q_pos[i])
        q_err[i, q_err[i, :] < 0] = 1e-15   # clamp inside the loop, not just for the last i
    slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
    xmax = np.max([xmax, -np.log10(q_th[1])])
    ymax = np.max([ymax, -np.log10(q_data[0])])

    ax.plot(-np.log10(q_th[n_quantiles - 1:]),
            -np.log10(q_data[n_quantiles - 1:]),
            '-',
            color=color)
    ax.plot(-np.log10(q_th[:n_quantiles]),
            -np.log10(q_data[:n_quantiles]),
            '.',
            color=color,
            label='gf')
    ax.plot([0, xmax], [0, xmax], '--', color='#f42e30')  # 'k' in the format string conflicts with color=
    ax.fill_between(
        -np.log10(q_th),
        -np.log10(q_err[:, 0]),
        -np.log10(q_err[:, 1]),
        color=color,
        alpha=0.1,
    )
Example #44
def main(argv=None):
	parser=argparse.ArgumentParser(description="Compute various statistics related to the sequences either in the provided fasta files or for the sequences piped in")
#	parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
	parser.add_argument('-p',dest="pretty",action="store_true",help="Pretty print using PrettyTable module")
	parser.add_argument('-d',dest="delimiter",help="Column separator for output, defaults to whitespace",default=" ")
	parser.add_argument('-t',dest="min_length",help="Minimum length threshold to filter fasta file",default=0,type=int)
	parser.add_argument('-r',dest="reference_length",help="(Not yet implemented) Reference length used to compute corrected Nx values",default=0)
	parser.add_argument('-o', nargs='?', type=argparse.FileType('w'), default=sys.stdout,dest="outfile")
	parser.add_argument('FASTAFILE',action='append',nargs="+",help='List of fasta files to keep. Use "*" to keep them all')
	args=parser.parse_args()
	all_records=[]
	FASTAFILE=args.FASTAFILE[0]
	if args.pretty:
		import prettytable

	for f in FASTAFILE: 
		for record in SeqIO.parse(f, "fasta", generic_dna):
			if len(record.seq)<=args.min_length:
				continue
			all_records.append(SequenceStat(f,record))
	# Display summary statistics per file
	sequences_per_files=collections.defaultdict(list)
	for s in all_records:
		sequences_per_files[s.file].append(s)
	if args.pretty:
		table=prettytable.PrettyTable(["File","#Seqs","Avg GC","Avg Length(kb)", "Quant","min","max",  "Sum Length(kb)","N50(kb)","L50"])
		table.align["File"] = "l" 

		for file,seqs in sequences_per_files.items():
			lengths=[x.length for x in seqs]
			table.add_row([file,len(seqs),round(scipy.average([x.gc for x in seqs]),2),\
				round(scipy.average(lengths)/1000,2),mquantiles(lengths),min(lengths),max(lengths),round(sum(lengths)/1000,2),round(N50.N50(lengths)/1000,2),N50.L50(lengths)])
		print >>args.outfile,table.get_string(sortby="N50(kb)")

	else:
		for file,seqs in sequences_per_files.items():
			lengths=[x.length for x in seqs]

			print >>args.outfile," ".join(map(str,[\
				file,len(seqs),scipy.average([x.gc for x in seqs]),\
				scipy.average(lengths),sum(lengths),N50.N50(lengths),N50.L50(lengths)
				]))
Example #45
def plotInfo(listex,listey, title):
    
    x=np.array(listex)
    y=np.array(listey)

    # compute the values
    n = len(x)
    mean = np.mean(x)
    var = np.var(x)
    
    
    f = plt.figure()
    ax = f.add_subplot(111)    
    
    quantiles=ssm.mquantiles(x)

    
    leTexte = (
    "nbPoint: " + '%.0f' % n +"\n"    
    + "mean: " + '%.4f' % mean +"\n"
    + "var: " + '%.4f' % var + "\n"
    + "quantile1: " + '%.4f' % quantiles[0] + "\n"
    + "quantile2: " + '%.4f' % quantiles[2]
    )  
    
    # display the info text
    plt.text(0.82,0.80,leTexte,horizontalalignment='center',
     verticalalignment='center', transform = ax.transAxes)

    # draw the scatter plot
    plt.scatter(x,y, s = 7)
    
    # mark the quantiles
    plt.axvline(x=quantiles[0], linewidth=3, color='g')    
    plt.axvline(x=quantiles[2], linewidth=3, color='g')  
    
    #labels
    plt.xlabel("Stickiness")
    plt.ylabel("Abondance")
    plt.title(title)
    
    plt.savefig(title)    
Example #46
def PredictAgeFreq(hdf5file,nAnimal,burn=0,quantile=[.025,.5,.975],MinYear=1875,MaxYear=1980):
    
    nyear=MaxYear-MinYear+1
    PredAgeFreq=[]
    for FileName in hdf5file:
        print(FileName)
        f=tables.open_file(FileName,mode='r+')
        nTable=len(f.list_nodes('/'))
        # Names of tables
        curName='//chain0//PyMCsamples'
        try:
          curTable=f.get_node(curName)
        except:
          print('PredictAgeFreq 28 ',FileName)
          curTable=f.get_node(curName)

        i=0
        for t in curTable.iterrows():
            if i>=burn:
                  try:
                      LogRecruit=[t['LogRecruit_'+str(s)]  for s in range(MinYear,1+MaxYear)]
                  except:
                      LogRecruit=[t['LogRecruit'+str(s)]  for s in range(MinYear,1+MaxYear)]
                  lnM=t['lnM']
                  M=exp(lnM)
                  
                  #UnNormalized probabilities
                  UnNorm=[ exp(t+M*(y-nyear/2)) for y,t in enumerate(LogRecruit)]
                  NormProb=[t/sum(UnNorm)  for t in UnNorm]
                  
                  #Random Age Frequency
                  CurAgeFreq=list(multinomial(nAnimal, NormProb).rvs()[0])
                  PredAgeFreq+=[CurAgeFreq]  
                  i+=1 
                  
    #Quantiles on number of animals for every age-class
    qanimal=[ mquantiles( [t[i]  for t in PredAgeFreq],prob=quantile)   for i in range(nyear)]
      
    result={}
    for i,t in enumerate(quantile):
          result[t]=[s[i]   for s in qanimal]
    return(result)
Example #47
def check_qual(qual):
    """check seq for mean quality < 25 
       drop qual scores in lowest decile
    """
    
    quals = []
    for i in qual:
        quals.append(ord(i) - 33)

    # drop lowest decile of qual scores
    decile = float(mquantiles(quals, prob = [0.1]))
    
    quals = [x for x in quals if x > decile]

    mean_qual = float(sum(quals)) / max(len(quals), 1)
    
    if mean_qual < 25:
        return False
    else:
        return True
Example #48
def calc_nonlin(spikes, generator, nr_bins=20):
    """
    Calculate nonlinearities from the spikes and the generator signal.
    Bins for the generator are defined such that they contain equal number
    of samples. Since there are fewer samples for more extreme values of the
    generator signal, bins get wider.
    """

    quantiles = np.linspace(0, 1, nr_bins + 1)

    # m stands for masked, to be able to apply the function
    # to masked numpy arrays. In practice, masked arrays are rarely needed.
    quantile_bins = mquantiles(generator, prob=quantiles)

    res = binned_statistic(generator, spikes, bins=quantile_bins)

    nonlinearity = res.statistic
    bins = bin_midpoints(quantile_bins)

    return nonlinearity, bins
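
A standalone sketch of the equal-count binning above: quantile edges from mquantiles feed scipy's binned_statistic, and the bin_midpoints helper is replaced by an inline midpoint computation (an assumption about what that helper does):

import numpy as np
from scipy.stats import binned_statistic
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(9)
generator = rng.normal(size=5000)
spikes = rng.poisson(np.exp(generator))            # toy exponential-nonlinearity spikes

edges = np.asarray(mquantiles(generator, prob=np.linspace(0, 1, 21)))
nonlinearity = binned_statistic(generator, spikes, bins=edges).statistic
midpoints = (edges[:-1] + edges[1:]) / 2
print(midpoints[:3], nonlinearity[:3])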
Example #49
    def finalize(self):
        tot = 0
        vals = []
        for x in self.states.values():
            vals.append(x.score)
        assert len(vals) > 0
        #assert tot >= 0.1
        limv = mquantiles(vals, 0.90)[0] / 3
        #limv = 1e-9
        tot = np.sum(vals)
        if tot < 1e-9: return False

        nstates = dict()
        for k, v in self.states.items():
            if v.score < limv: continue
            v.score /= tot
            nstates[k] = v
        assert len(nstates) > 0, vals
        self.states = nstates
        return True
Beispiel #50
0
def grid_from_X(x, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles of ``x``.
    """
    x = x[~x.isnull()]
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= p <= 1. for p in percentiles):
        raise ValueError('percentile values must be in [0, 1]')

    uniques = np.unique(x)
    if uniques.shape[0] < grid_resolution:
        # feature has low resolution use unique vals
        return uniques
    else:
        emp_percentiles = mquantiles(x, prob=percentiles)
        # create axis based on percentiles and grid resolution
        return np.linspace(emp_percentiles[0],
                           emp_percentiles[1],
                           num=grid_resolution,
                           endpoint=True)
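
A short sketch of the expected input, a pandas Series (the function calls `.isnull()`); the data here are synthetic:

import numpy as np
import pandas as pd
from scipy.stats.mstats import mquantiles

values = pd.Series(np.random.default_rng(1).lognormal(size=1000))
grid = grid_from_X(values, percentiles=(0.05, 0.95), grid_resolution=50)
print(grid.shape)  # (50,): linspace between the 5th and 95th percentiles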
Beispiel #51
0
    def h_smooth(self, mag):
        '''
        Function to calculate smoothing coefficient (h) for Gaussian
        Kernel estimation - based on Silverman (1986) formula

        :param numpy.ndarray mag:
            Magnitude vector

        :returns:
            Smoothing coefficient (h) (float)
        '''
        neq = float(len(mag))

        # Calculate inter-quartile range
        qtiles = mquantiles(mag, prob=[0.25, 0.75])
        iqr = qtiles[1] - qtiles[0]
        hfact = 0.9 * np.min([np.std(mag), iqr / 1.34]) * (neq**(-1. / 5.))
        # Round h to 2 dp
        hfact = np.round(100. * hfact) / 100.
        return hfact
Beispiel #52
0
def h_smooth(mag):
    """
    Function to calculate smoothing coefficient (h)
    for Gaussian Kernel estimation - based on Silverman (1986) formula.

    :param mag: Magnitude vector
    :type mag: numpy.ndarray
    :return hfact: Smoothing coefficient (h)
    :rtype hfact: Float
    """

    neq = float(np.shape(mag)[0])

    # Calculate inter-quartile range
    qtiles = mquantiles(mag, prob=[0.25, 0.75])
    iqr = qtiles[1] - qtiles[0]
    hfact = 0.9 * np.min([np.std(mag), iqr / 1.34]) * (neq**(-1. / 5.))
    # Round h to 2 dp
    hfact = np.round(100. * hfact) / 100.
    return hfact
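
A quick synthetic check of the Silverman bandwidth (value approximate and seed-dependent):

import numpy as np
from scipy.stats.mstats import mquantiles

mags = np.random.default_rng(42).normal(loc=5.0, scale=0.5, size=400)
print(h_smooth(mags))  # on the order of 0.9 * 0.5 * 400**(-0.2) ~ 0.14, rounded to 2 dp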
Beispiel #53
0
def bin_data_in_energy(dtf, n_bins=20):
    '''
    Bin the data in dtf to n_bins with equal statistics.

    Parameters
    ----------
    dtf: pandas DataFrame
        The DataFrame containing the data.
        Must contain a 'log_reco_energy' column (used to calculate the bins).
    n_bins: int, default=20
        The number of reconstructed energy bins to divide the data in.

    Returns
    -------
    A dictionary of DataFrames (keys=energy ranges, values=separated DataFrames).
    '''

    dtf_e = dict()

    # n_bins + 1 quantile edges give n_bins equal-statistics bins
    log_e_reco_bins = mstats.mquantiles(dtf['log_reco_energy'].values, np.linspace(0, 1, n_bins + 1))

    for i_e_bin, log_e_high in enumerate(log_e_reco_bins):
        if i_e_bin == 0:
            continue

        mask = np.logical_and(
            dtf['log_reco_energy'] > log_e_reco_bins[i_e_bin - 1],
            dtf['log_reco_energy'] < log_e_high
        )
        this_dtf = dtf[mask]
        if len(this_dtf) < 1:
            raise RuntimeError('One of the energy bins is empty')

        this_e_range = '{:3.3f} < E < {:3.3f} TeV'.format(
            10**log_e_reco_bins[i_e_bin - 1],
            10**log_e_high
        )

        dtf_e[this_e_range] = this_dtf

    return dtf_e
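
A synthetic usage sketch (the column name is taken from the docstring; the data are invented):

import numpy as np
import pandas as pd
from scipy.stats import mstats

dtf = pd.DataFrame({'log_reco_energy': np.random.default_rng(0).uniform(-1, 2, 10000)})
dtf_e = bin_data_in_energy(dtf, n_bins=5)
for e_range, sub in dtf_e.items():
    print(e_range, len(sub))  # roughly equal row counts per bin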
Beispiel #54
0
def make_plot(df, site, mcmc_traces, x_range, depth, to_screen):

    # posteriors for the parameters
    a_post = mcmc_traces["a"][:, None]
    b_post = mcmc_traces["b"][:, None]

    # mean prediction
    beta_pred = np_sigmoid(x_range, a_post, b_post)
    mean_pred = beta_pred.mean(0)

    # vectorized bottom and top 2.5% quantiles for the credible interval
    quantiles = mquantiles(beta_pred, [0.025, 0.975], axis=0)

    if to_screen:
        pm.traceplot(mcmc_traces)

    plt.figure(figsize=(10, 6))
    plt.fill_between(x_range, *quantiles, alpha=0.7, color="salmon")
    plt.plot(x_range, mean_pred, lw=2, ls="-", color="crimson")
    plt.scatter(df.sw.values, df.beta.values, color="k", s=50, alpha=0.5)
    plt.xlim(x_range.min(), x_range.max())
    plt.ylim(-0.02, 1.02)
    plt.xlabel("SW")
    plt.ylabel("Beta")

    if to_screen:
        plt.show()
    else:
        plt.savefig("plots/%s_%s.png" % (site, depth), dpi=100)

        pm.traceplot(mcmc_traces)
        plt.savefig("plots/%s_%s_posterior.png" % (site, depth), dpi=100)
Beispiel #55
0
def smoothScatterCalcDensity(x, nbin, bandwidth=None, rangex=None):
    '''
    Preprocessing step for the KDE function:
        'nbin' initialization,
        'bandwidth' initialization and validation

    x : numpy array [shape = (N, 2)] - coordinates of the points
        (quantiles are taken along axis 0, giving one bandwidth per column)
    nbin : int or [int, int] - number of bins along each axis
        (a single value is expanded to [nbin, nbin])
    bandwidth : [optional] positive numeric array of size 2 with smoothing bandwidth
    rangex : [optional] axis ranges, passed through to bkde2D

    return  axes - pair of lists with axis breakpoints
            fhat - binned Kernel Density Estimation matrix (square)
            bandwidth - smoothing bandwidth (estimated here when the input bandwidth is None)
    Source: R::KernSmooth::smoothScatterCalcDensity
    '''
    if isinstance(nbin, numbers.Number):
        nbin = (nbin, nbin)
    elif (isinstance(nbin, list)
          and len(nbin) == 1) or (isinstance(nbin, np.ndarray)
                                  and len(nbin) == 1):
        nbin = (nbin[0], nbin[0])
    if len(nbin) != 2 or not (isinstance(nbin[0], numbers.Number)
                              and isinstance(nbin[1], numbers.Number)):
        raise ValueError("'nbin' must be numeric of length 1 or 2")
    if bandwidth is None:
        # R compatibility
        q_data = mquantiles(x, prob=[0.05, 0.95], alphap=1, betap=1,
                            axis=0).data
        bandwidth = np.diff(q_data, axis=0) / 25
        bandwidth[bandwidth == 0] = 1
        bandwidth = bandwidth[0]
    else:
        if not (isinstance(bandwidth, numbers.Number)
                or isinstance(bandwidth, np.ndarray)):
            raise ValueError("'bandwidth' must be numeric")
        if isinstance(bandwidth,
                      np.ndarray) and len(bandwidth[bandwidth <= 0]) > 0:
            raise ValueError("'bandwidth' must be positive")
    rv = bkde2D(x, bandwidth=bandwidth, gridsize=nbin, rangex=rangex)
    # return axes, fhat, bandwidth
    return rv[0], rv[1], bandwidth
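
A hedged call sketch; `bkde2D` is assumed to be the project's port of R::KernSmooth::bkde2D and is not reproduced here:

import numpy as np
from scipy.stats.mstats import mquantiles

pts = np.random.default_rng(3).normal(size=(500, 2))  # N x 2 points
axes, fhat, bandwidth = smoothScatterCalcDensity(pts, nbin=64)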
Beispiel #56
0
def plot_features(FFSen,
                  baseline_shape,
                  baseline_mask,
                  llimit=0.01,
                  ulimit=0.99,
                  num_features=32,
                  xmin=200,
                  xmax=1600):
    """
    Visualize the sensitivity maps for the hidden layer units.
    :param FFSen:
    :param llimit:
    :param ulimit:
    :param num_features:
    :param xmin:
    :param xmax:
    :return:
    """
    cols = 2
    rows = num_features // cols  # integer count of subplot rows

    plt.style.use('ggplot')
    plt.figure()

    plt.cla()
    for j, feat in enumerate(FFSen[0:num_features, :]):
        feat = feat - np.mean(feat, axis=0)
        feat = feat / np.max(np.abs(feat)) + 1e-32
        quantiles = mquantiles(feat, [llimit, ulimit])
        wt_vol = get3DVol(feat, baseline_shape, baseline_mask)
        plt.subplot(rows, cols, j + 1)
        im = plt.imshow(wt_vol[:, xmin:xmax],
                        cmap=plt.cm.RdBu_r,
                        aspect='auto',
                        interpolation='none',
                        vmin=-0.06,
                        vmax=0.06)
        plt.grid()
        im.set_clim(quantiles[0], quantiles[1])
        plt.axis('off')
    plt.show()
Beispiel #57
0
def row_stats(row, as_strings=True, engin=False):
    q1, q2, q3 = mquantiles(row)
    stats = {}
    stats["N"] = len(row)
    stats["#0s"] = len([k for k in row if abs(k) < c_eps])
    stats["%0s"] = stats["#0s"] / float(len(row))
    stats["Sum"] = sum(row)
    stats["Min"] = min(row)
    stats["Q1"] = q1
    stats["Q2_Med"] = q2
    stats["Q3"] = q3
    stats["Max"] = max(row)
    stats["Mean"] = mean(row)
    stats["StDev"] = std(row)
    stats["CfVar"] = stats["StDev"] / stats["Mean"] if stats["Mean"] != 0 else 0
    if set(stats.keys()) != set(c_props):
        die("Inconsistent stat lists. Check code.")
    if as_strings:
        stats = {k: pretty(v, engin) for k, v in stats.items()}
    return stats
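
A hedged sketch of a call; `c_eps`, `c_props`, `mean`, `std`, `pretty` and `die` are module-level names from the original project and must already be in scope:

stats = row_stats([1.0, 0.0, 2.5, 4.0], as_strings=False)
print(stats["Q2_Med"], stats["%0s"])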
Beispiel #58
0
def gauss_degrade(image,margin=1.0,change=None,noise=0.02,minmargin=0.5,inner=1.0):
    if image.ndim==3: image = mean(image,axis=2)
    m = mean([amin(image),amax(image)])
    image = 1.0*(image>m)  # float so the noise can be added in place below
    if margin<minmargin: return 1.0*image
    pixels = sum(image)
    if change is not None:
        npixels = int((1.0+change)*pixels)
    else:
        edt = distance_transform_edt(image==0)
        npixels = sum(edt<=(margin+1e-4))
    r = int(max(1,2*margin+0.5))
    ri = int(margin+0.5-inner)
    # cast to int before subtracting: NumPy forbids subtracting boolean arrays
    if ri<=0:
        mask = binary_dilation(image,iterations=r).astype(int)-image
    else:
        mask = (binary_dilation(image,iterations=r).astype(int)
                - binary_erosion(image,iterations=ri).astype(int))
    image += mask*randn(*image.shape)*noise*min(1.0,margin**2)
    smoothed = gaussian_filter(1.0*image,margin)
    frac = max(0.0,min(1.0,npixels*1.0/prod(image.shape)))
    threshold = mquantiles(smoothed,prob=[1.0-frac])[0]
    result = (smoothed>threshold)
    return 1.0*result
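
The snippet relies on star imports; a plausible set (an assumption on our part) and a synthetic call:

import numpy as np
from numpy import mean, amin, amax, sum, prod
from numpy.random import randn
from scipy.ndimage import (distance_transform_edt, gaussian_filter,
                           binary_dilation, binary_erosion)
from scipy.stats.mstats import mquantiles

img = np.zeros((64, 64))
img[20:44, 20:44] = 1.0  # synthetic binary glyph
degraded = gauss_degrade(img, margin=1.5, noise=0.05)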
Beispiel #59
0
def find_empirical_equiprobable_bins_midpoints(N, data):
    '''
    Given N equiprobable bins and a data sample, return the conditional
    expectation nodes (bin means), the interior cutoffs, and the empirical
    probability of each bin.
    NOTE that the empirical probabilities will likely *not* be exactly equal,
    due to the finite sample. As N_data -> infty they converge to 1/N by
    properties of the ECDF.

    Nathan M. Palmer
    '''
    # Get initial cutoffs:
    cutoffs0 = np.linspace(0, 1, (N + 1))
    # Need to plug into the inverse ecdf

    cutoffs = mquantiles(a=data,
                         prob=cutoffs0,
                         alphap=1.0 / 3.0,
                         betap=1.0 / 3.0)
    # mquantiles(a, prob=[0.25, 0.5, 0.75], alphap=0.4, betap=0.4, axis=None, limit=())
    # (alphap, betap) = (1/3, 1/3): p(k) = (k - 1/3)/(n + 1/3), so p(k) ~ median[F(x[k])].
    # The resulting quantile estimates are approximately median-unbiased
    # regardless of the distribution of x (R type 8).

    # Set infinite upper and lower cutoffs:
    cutoffs[0] = -np.inf
    cutoffs[-1] = np.inf

    # Init containers
    EX = []
    pX = []

    for lo, hi in zip(cutoffs[:-1], cutoffs[1:]):
        bin_indx = np.logical_and(data >= lo, data < hi)
        EX.append(np.mean(data[bin_indx]))  # Should converge to correct
        pX.append(np.mean(bin_indx))  # Should also converge to proper

    EX = np.array(EX)
    pX = np.array(pX)

    # slice off the -inf and inf bin cutoffs
    return EX, cutoffs[1:-1], pX
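
A synthetic sketch: with enough draws, each empirical bin probability comes out close to 1/N:

import numpy as np
from scipy.stats.mstats import mquantiles

draws = np.random.default_rng(7).lognormal(sigma=0.8, size=100000)
EX, cutoffs, pX = find_empirical_equiprobable_bins_midpoints(5, draws)
print(pX)            # each entry close to 0.2
print(len(cutoffs))  # 4 interior cutoffs for N = 5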
Beispiel #60
0
    def get_potential_energy(self, atoms, output=(.5, )):
        """Returns the potential energy from the ensemble for the atoms
        object.

        By default only returns the median prediction (50th percentile)
        of the ensemble, such that it works like a normal ASE calculator.
        To get uncertainty information, use the output keyword with the
        following codes:

            <q>: (where <q> is a float) return the q quantile of the
            ensemble (where the quantile is a decimal, as in 0.5 for 50th
            percentile)

            e: return the whole ensemble prediction as a list

        Join the arguments with commas. For example, to return the median
        prediction plus a centered spread covering 90% of the ensemble
        prediction, use output=[.5, .05, .95].
        If the ensemble is requested, it must be the last argument, e.g.,
        output=[.5, .025, .975, 'e'].
        Note a list is typically returned, but if only one attribute is
        requested it returns it as a float, so that it's ASE-like.
        """
        energies = [calc.get_potential_energy(atoms) for calc in self.ensemble]
        if output[-1] == 'e':
            quantiles = output[:-1]
            return_ensemble = True
        else:
            quantiles = output
            return_ensemble = False
        for quantile in quantiles:
            if (quantile > 1.0) or (quantile < 0.0):
                raise RuntimeError('Quantiles must be between 0 and 1.')
        result = mquantiles(energies, prob=quantiles)
        result = list(result)
        if return_ensemble:
            result.append(energies)
        if len(result) == 1:
            result = result[0]
        return result
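
A hedged sketch of how the output codes combine; `ensemble_calc` stands for an instance of the (unnamed) ensemble-calculator class this method belongs to, and `atoms` for an ASE Atoms object:

median_e = ensemble_calc.get_potential_energy(atoms)                      # float: 50th percentile only
med, lo, hi = ensemble_calc.get_potential_energy(atoms, output=(.5, .05, .95))
med, all_e = ensemble_calc.get_potential_energy(atoms, output=(.5, 'e'))  # ensemble list comes last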