def plot_core_pos_uncertainty_vs_R(table):
    figure()

    x, y = table.col('reference_core_pos').T
    x2, y2 = table.col('reconstructed_core_pos').T
    d = sqrt((x - x2) ** 2 + (y - y2) ** 2)

    r = table.col('r')

    bins = linspace(0, 50, 41)
    x, d25, d50, d75 = [], [], [], []
    for low, high in zip(bins[:-1], bins[1:]):
        sel = d.compress((low <= r) & (r < high))

        if len(sel) > 0:
            x.append((low + high) / 2)
            d25.append(scoreatpercentile(sel, 25))
            d50.append(scoreatpercentile(sel, 50))
            d75.append(scoreatpercentile(sel, 75))

    fill_between(x, d25, d75, color='0.75')
    plot(x, d50, 'o-', color='black')

    xlabel("Core distance [m]")
    ylabel("Core position uncertainty [m]")
    utils.saveplot()
def plot_percentiles(data, numbins, xlim, ylim, vert = True, color = 'k', linestyle = 'solid', linew = 2):
    perc = 1. / numbins 
    for i in range(1, numbins):
        if vert:
            plt.vlines(sts.scoreatpercentile(data, i * perc * 100.), ylim[0], ylim[1], color, linestyle, linewidth = linew)
        else:
            plt.hlines(sts.scoreatpercentile(data, i * perc * 100.), xlim[0], xlim[1], color, linestyle, linewidth = linew)
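A minimal usage sketch for plot_percentiles (hypothetical data; it assumes the function above is in scope together with matplotlib.pyplot as plt and scipy.stats as sts, matching the aliases it uses):

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts

data = np.random.normal(size=1000)
plt.hist(data, bins=30, alpha=0.5)
# numbins=4 draws vertical lines at the 25th, 50th and 75th percentiles
plot_percentiles(data, numbins=4, xlim=plt.xlim(), ylim=plt.ylim(),
                 vert=True, color='r', linestyle='dashed')
plt.show()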
Example 3
def lookatresults(data, modes, theta=None, vert=False, labels=None):


    P = data[-1][0]
    n = P.shape[0]

    if labels is None:
        labels = [""] * n

    if vert == True:
        subplots = range(n*100+11,n*100+n+11,1)
        figsize = (6, 3*n)
    elif vert == 'four':
        subplots = [221, 222, 223, 224]
        figsize = (10, 10)
    else:
        subplots = range(100+n*10+1,100+n*10+1+n,1)
        figsize = (5*n, 3)

    f = stats.gaussian_kde(data[-1][0])
    int_guess = np.mean(data[-1][0], axis=1)
    modes = minimize(neg, int_guess, args=(f)).x

    thetas = []
    P = data[-1][0]
    labelpad = 20

    for i in xrange(n):
        x = P[i]
        t = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i]-stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84)-modes[i], i+1)

        thetas.append(t)

    if P.shape[1] > 10:
        bins = int(np.sqrt(P.shape[1]))
    else:
        bins = 10
    fig = plt.figure(figsize=figsize)
    
    for i in xrange(n):
        print subplots[i]
        plt.subplot(int(subplots[i]))
        #plt.title(thetas[0])
        ker = stats.gaussian_kde(P[i])
        h = plt.hist(P[i], bins=bins, normed=True, alpha=1)
        x = np.linspace(h[1][0],h[1][-1],1000)
        plt.plot(x,ker(x))
        plt.xlabel(labels[i], labelpad=labelpad, fontsize=24)
        if theta is not None:
            plt.axvline(theta[0])

    for t in thetas: 
        print t

    return fig
Example 4
 def __init__(self, a):
     self.min = a.min()
     self.q1 = stats.scoreatpercentile(a, 25)
     self.median = numpy.median(a)
     self.mean = a.mean()
     self.q3 = stats.scoreatpercentile(a, 75)
     self.max = a.max()
def plot_uncertainty_core_distance(table):
    N = 2
    THETA = deg2rad(22.5)
    DTHETA = deg2rad(5.)
    DN = .5
    DR = 10
    LOGENERGY = 15
    DLOGENERGY = .5

    figure()
    x, y, y2 = [], [], []
    for R in range(0, 81, 20):
        x.append(R)
        events = table.read_where('(abs(min_n134 - N) <= DN) & (abs(reference_theta - THETA) <= DTHETA) & (abs(r - R) <= DR) & (abs(log10(k_energy) - LOGENERGY) <= DLOGENERGY)')
        print(len(events), end=' ')
        errors = events['reference_theta'] - events['reconstructed_theta']
        # Make sure -pi < errors < pi
        errors = (errors + pi) % (2 * pi) - pi
        errors2 = events['reference_phi'] - events['reconstructed_phi']
        # Make sure -pi < errors2 < pi
        errors2 = (errors2 + pi) % (2 * pi) - pi
        #y.append(std(errors))
        #y2.append(std(errors2))
        y.append((scoreatpercentile(errors, 83) - scoreatpercentile(errors, 17)) / 2)
        y2.append((scoreatpercentile(errors2, 83) - scoreatpercentile(errors2, 17)) / 2)

    print()
    print("R: theta_std, phi_std")
    for u, v, w in zip(x, y, y2):
        print(u, v, w)
    print()

#    # Simulation data
    sx, sy, sy2 = loadtxt(os.path.join(DATADIR, 'DIR-plot_uncertainty_core_distance.txt'))

    graph = GraphArtist()

    # Plots
    plot(x, rad2deg(y), '^-', label="Theta")
    graph.plot(x[:-1], rad2deg(y[:-1]), mark='o')
    plot(sx, rad2deg(sy), '^-', label="Theta (sim)")
    graph.plot(sx[:-1], rad2deg(sy[:-1]), mark='square')
    plot(x, rad2deg(y2), 'v-', label="Phi")
    graph.plot(x[:-1], rad2deg(y2[:-1]), mark='*')
    plot(sx, rad2deg(sy2), 'v-', label="Phi (sim)")
    graph.plot(sx[:-1], rad2deg(sy2[:-1]), mark='square*')

    # Labels etc.
    xlabel("Core distance [m] $\pm %d$" % DR)
    graph.set_xlabel(r"Core distance [\si{\meter}] $\pm \SI{%d}{\meter}$" % DR)
    ylabel("Angle reconstruction uncertainty [deg]")
    graph.set_ylabel(r"Angle reconstruction uncertainty [\si{\degree}]")
    title(r"$N_{MIP} = %d \pm %.1f, \theta = 22.5^\circ \pm %d^\circ, %.1f \leq \log(E) \leq %.1f$" % (N, DN, rad2deg(DTHETA), LOGENERGY - DLOGENERGY, LOGENERGY + DLOGENERGY))
    ylim(ymin=0)
    graph.set_ylimits(min=0)
    xlim(-2, 62)
    legend(numpoints=1, loc='best')
    utils.saveplot()
    artist.utils.save_graph(graph, dirname='plots')
    print()
    def freedmanDiaconisRule(self,data):
        """
        Calculate number of bins to use in histogram according to this rule.

        Parameters:
        data    -    a numpy.ndarray containing the data for which a histogram is to be computed.

        Returns:

        The 'optimal' number of bins for the histogram.
        """
        # interquartile range, Q3-Q1....
        iqr = stats.scoreatpercentile(data, 75) - stats.scoreatpercentile(data, 25)
        binwidth = 2 * iqr * pow(len(data), -0.3333333)

        if(binwidth<=0):
            binwidth=60

        # calculate n bins
        rnge = max(data) - min(data)
        nbins = ceil( rnge / binwidth )

        if(self.verbose):
            print "\t\tFreedman Diaconis Rule values for bins:"
            print "\t\t\tIQR: ",iqr
            print "\t\t\tBin Width: ",binwidth
            print "\t\t\tRange: ",rnge
            print "\t\t\tNumber of bins: ", nbins

        return int(nbins)
Example 7
 def get_isize_stats(self, limit=1e5): 
     """Estimate insert size median, mean and stdev.
     Also count pair orientations and select main.
     """
     if self.log:
         self.log.write("Estimating insert size stats...\n")
     isizes = []
     self.pairs = [0, 0, 0, 0]
     for alg in pysam.Samfile(self.bam):
         #take only reads with good alg quality and one read per pair
         if alg.mapq < self.mapq or alg.isize < 1:
             continue
         #store isize
         isizes.append(alg.isize)
         #store pair orientation
         self.pairs[self.alg2orientation(alg)] += 1
         #stop if limit reached
         if len(isizes) >= limit:
             break
     #get rid of right 5 percentile
     maxins = stats.scoreatpercentile(isizes, 100-self.q)
     minins = stats.scoreatpercentile(isizes, self.q)
     isizes = filter(lambda x: minins<x<maxins, isizes)
     #store
     self.isize_median = np.median(isizes)
     self.isize_mean   = np.mean(isizes)
     self.isize_stdev  = np.std(isizes)
Example 8
def _filter_ridge_lines(cwt, ridge_lines, window_size=None, min_length=None,
                       min_snr=1, noise_perc=10):

    num_points = cwt.shape[1]
    if min_length is None:
        min_length = n.ceil(cwt.shape[0] / 4)
    if window_size is None:
        window_size = n.ceil(num_points / 20)
    hf_window = window_size / 2

    #Filter based on SNR
    row_one = cwt[0, :]
    noises = n.zeros_like(row_one)
    for ind, val in enumerate(row_one):
        window = n.arange(max([ind - hf_window, 0]), min([ind + hf_window, num_points]))
        window = window.astype(int)
        noises[ind] = scoreatpercentile(row_one[window], per=noise_perc)
        #noises[ind] = n.std(row_one[window])
    noise_level = scoreatpercentile(row_one, per = noise_perc)

    def filt_func(line):
        if len(line[0]) < min_length:
            return False
        #snr = abs(cwt[line[0][0], line[1][0]] / noises[line[1][0]])
        c=line[0][-1]/2
        #snr = -cwt[c, line[1][0]] / noises[line[1][0]]
        #snr = cwt[c, line[1][0]] / abs(noises[line[1][0]]) + 1
        snr = cwt[c, line[1][0]] / abs(noise_level) + 1
        #line.append(['snr=',snr,c,cwt[c, line[1][0]] , noises[line[1][0]]] )
        line.append(['snr=',snr,c,cwt[c, line[1][0]] , noise_level] )
        if snr < min_snr:
            return False
        return True

    return list(filter(filt_func, ridge_lines))
Example 9
def bin_and_plot_data(pop, num_samples):
	max_file = './maxfiles/random/' + pop + '.' + str(num_samples) + '.' + str(ind) + '.max'
	composite_max_file  = './maxfiles/random/' + pop + '.' + str(num_samples) + '.' + 'composite' + '.max'
	#make_quantiles(match_file, quant_list)

	data = read_data(composite_max_file)


	my_bins = make_bins()
	
	start =   time.clock() 
	counter_2, weighted_counter = bin_data_2(composite_max_file, my_bins)
	print 'my sum is', sum(counter_2)
	end =  time.clock()

	print  'The SORTED binning process took', end - start, 'seconds.'


	#(n, bins, patches) = P.hist(data, bins = 100, normed = True, cumulative = False, alpha = .5, label = pop)


	#center = (bins[:-1] + bins[1:]) / 2
	plt.plot(my_bins, counter_2, label = pop)
	#plt.plot(center,n)
	P.xlim([0,1e6])
	#for i in range(0, len(my_bins)):
	#  print my_bins[i], counter_2[i]
	print pop, len(data)
	print min(data)
	print stats.scoreatpercentile(data, .5)
	print max(data)
Example 10
File: c5.py Project: 3774257/abu
def sample_571_2():
    """
    5.7.1 How the golden-section lines are defined
    :return:
    """
    from collections import namedtuple

    # maximum of the close-price series
    cs_max = tsla_df.close.max()
    # minimum of the close-price series
    cs_min = tsla_df.close.min()

    sp382 = (cs_max - cs_min) * 0.382 + cs_min
    sp618 = (cs_max - cs_min) * 0.618 + cs_min
    sp382_stats = stats.scoreatpercentile(tsla_df.close, 38.2)
    sp618_stats = stats.scoreatpercentile(tsla_df.close, 61.8)

    def plot_golden():
        # the larger of the visual 618 and the statistical 618
        above618 = np.maximum(sp618, sp618_stats)
        # the smaller of the visual 618 and the statistical 618
        below618 = np.minimum(sp618, sp618_stats)
        # the larger of the visual 382 and the statistical 382
        above382 = np.maximum(sp382, sp382_stats)
        # the smaller of the visual 382 and the statistical 382
        below382 = np.minimum(sp382, sp382_stats)

        # plot the closing price
        plt.plot(tsla_df.close)
        # horizontal line: visual 382
        plt.axhline(sp382, c='r')
        # horizontal line: statistical 382
        plt.axhline(sp382_stats, c='m')
        # horizontal line: visual 618
        plt.axhline(sp618, c='g')
        # horizontal line: statistical 618
        plt.axhline(sp618_stats, c='k')

        # fill the 618 band in red
        plt.fill_between(tsla_df.index, above618, below618,
                         alpha=0.5, color="r")
        # fill the 382 band in green
        plt.fill_between(tsla_df.index, above382, below382,
                         alpha=0.5, color="g")

        # finally wrap the levels in a namedtuple for convenient access
        return namedtuple('golden', ['above618', 'below618', 'above382',
                                     'below382'])(
            above618, below618, above382, below382)

    golden = plot_golden()

    # label the legend entries in drawing order
    plt.legend(['close', 'sp382', 'sp382_stats', 'sp618', 'sp618_stats'],
               loc='best')
    plt.show()

    print('Theoretical maximum profit: {}'.format(golden.above618 - golden.below382))

    return golden
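For contrast, a small self-contained sketch (synthetic prices rather than tsla_df) of the two ways the levels above are computed: geometrically from the price range versus statistically with scoreatpercentile:

import numpy as np
from scipy import stats

close = 100.0 + np.cumsum(np.random.randn(250))   # synthetic close-price series

# "visual" levels: linear interpolation of the min-max range
cs_max, cs_min = close.max(), close.min()
sp382 = (cs_max - cs_min) * 0.382 + cs_min
sp618 = (cs_max - cs_min) * 0.618 + cs_min

# "statistical" levels: the 38.2th and 61.8th percentiles of the distribution
sp382_stats = stats.scoreatpercentile(close, 38.2)
sp618_stats = stats.scoreatpercentile(close, 61.8)

print(sp382, sp382_stats)
print(sp618, sp618_stats)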
Example 11
def generate_scipy_comparison(csvPathname):
    # this is some hack code for reading the csv and doing some percentile stuff in scipy
    from numpy import loadtxt, genfromtxt, savetxt

    dataset = loadtxt(
        open(csvPathname, 'r'),
        delimiter=',',
        dtype='int16');

    print "csv read for training, done"

    # we're going to strip just the last column for percentile work
    # used below
    NUMCLASSES = 10
    print "csv read for training, done"

    # data is last column
    # drop the output
    print dataset.shape
    if 1==0:
        n_features = len(dataset[0]) - 1;
        print "n_features:", n_features

        # get the end
        target = [x[-1] for x in dataset]

        print "histogram of target"
        print sp.histogram(target,bins=NUMCLASSES)

        print target[0]
        print target[1]

    from scipy import stats
    stats.scoreatpercentile(dataset, [10,20,30,40,50,60,70,80,90])
Example 12
def generatePercentile(columns, theTimeFilter, trafficFilter, centricity, dataRes):
    print "Running timeseries report"
    report = TrafficOverallTimeSeriesReport(profiler)

    report.run( columns = columns,
                timefilter = theTimeFilter,
                trafficexpr = TrafficFilter(trafficFilter),
                centricity = centricity,
                resolution = dataRes
                )

    data = report.get_data()
    report.delete()

    print "Getting data"
    dataList = fixBucket(data, int(sumTime))
    
    if outputData == True and clean == False:
        print "Data points:"
        for x in dataList[1]:
            print x[0]

    if graph == True:
        genGraph(data, dataList, percentileVal)

    #print '{}% Average Bytes is {}'.format(percentileVal, stats.scoreatpercentile(data, percentileVal)[0])
    if clean == False: print '{}% Average Bytes is {}'.format(percentileVal, stats.scoreatpercentile(dataList, percentileVal)[0])
    if Max == True and clean == False: print 'Maximum Average Bytes is {}'.format(max(data)[0])
    if Min == True and clean == False: print 'Minimum Average Bytes is {}'.format(min(data)[0])
    if Median == True and clean == False: print 'Median Average Bytes is {}'.format(stats.scoreatpercentile(data, 50)[0])
    if clean == True: print '{} {}'.format(trafficFilter, stats.scoreatpercentile(dataList[0], percentileVal)[0])
Example 13
def plot_histogram(trans_w, trans_g, index, nComps=0, doCombinedFitting=True):
    import scipy.stats as sps

    nBins = 100
    if isinstance(index, str):
        ww = trans_w[index]
        gg = trans_g[index]
    else:
        ww = trans_w[:, index]
        gg = trans_g[:, index]
#    low  = min(np.min(gg), np.min(ww))
#    high = max(np.max(gg), np.max(ww))
    low  = min(sps.scoreatpercentile(gg,  1), sps.scoreatpercentile(ww,  1))
    high = max(sps.scoreatpercentile(gg, 99), sps.scoreatpercentile(ww, 99))

    h_w, bin_edges = np.histogram(ww, nBins, (low, high))
    h_g, bin_edges = np.histogram(gg, nBins, (low, high))
    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ebkw = {'linewidth':1,}
    ax.errorbar(bin_centers, h_w, np.sqrt(h_w),label=w_label   ,color='g', **ebkw)
    ax.errorbar(bin_centers, h_g, np.sqrt(h_g),label=g_label   ,color='b', **ebkw)
    if type(index) == str:
        ax.set_xlabel(index, size='x-large')
    else:
        ax.set_xlabel(makeTitle(index, nComps, doCombinedFitting), size='x-large')
    ax.set_ylabel('Occupancy', size='x-large')
    ax.set_ylim(0,ax.get_ylim()[1])
    ax.grid()
    plt.legend()
Example 14
def just_plot_rc(rc, rc_controls, ax=None, x=None, top_limit=None, **kwargs):
    from numpy import array, mean
    from numpy.ma import masked_invalid
    
    if x is None:
        from numpy import arange
        x = arange(len(rc))
    
    if ax is None:
        ax = gca()
            
    from scipy.stats import scoreatpercentile
    upper95 = array([scoreatpercentile(rc_controls[:,i], 97.5) for i in range(len(x))])
    lower95 = array([scoreatpercentile(rc_controls[:,i], 2.5) for i in range(len(x))])
    
    control_mean = mean(rc_controls,axis=0)
    rc_norm = rc/control_mean
    rc_norm = masked_invalid(rc_norm)

    ax.plot(x, rc_norm, linewidth=1, color='k')
    ax.plot(ax.get_xlim(), (1,1), linestyle='--', color='k')
    ax.set_ylim(bottom=0)
    if ax.get_ylim()[1]<2:
        ax.set_ylim(top=2)
    if top_limit:
        ax.set_ylim(top=top_limit)
    ax.set_ylabel(r"$RC_{norm}$")#+"\n (normalized rich club coefficient)")
    ax.fill_between(x, 1,rc_norm, where=(rc>upper95) | (rc<lower95), **kwargs)
    
    return ax
Example 15
def trend_bins(x, y, xlow=None, xhigh=None, xbinwidth=None, nmin=100,
               lowpc=2.5, highpc=97.5, usemedian=True):
    if xlow is None:  xlow = scoreatpercentile(x, 1)
    if xhigh is None:  xhigh = scoreatpercentile(x, 99)
    if xbinwidth is None:  xbinwidth = (xhigh - xlow) * 100*nmin / len(x)
    x_bin = N.arange(xlow, xhigh+xbinwidth/2.0, xbinwidth)
    n_bin = len(x_bin)
    xx_bin = N.zeros(n_bin, N.float) - 99999
    y_bin = N.zeros((3, n_bin), N.float) - 99999
    ok = N.ones(n_bin, N.bool)
    for i, xb in enumerate(x_bin):
        inbin = (x >= xb - 0.5*xbinwidth) & (x < xb + 0.5*xbinwidth)
        x_inbin = x[inbin]
        y_inbin = y[inbin]
        if len(y_inbin) > nmin:
            xx_bin[i] = median(x_inbin)
            if usemedian:
                y_bin[0, i] = median(y_inbin)
            else:
                y_bin[0, i] = N.mean(y_inbin)
            y_bin[1, i] = scoreatpercentile(y_inbin, lowpc)
            y_bin[2, i] = scoreatpercentile(y_inbin, highpc)
        else:
            ok[i] = False
    #x_bin = x_bin[ok]
    xx_bin = xx_bin[ok]
    y_bin = y_bin[:,ok]
    return xx_bin, y_bin
Example 16
def calc_quartiles(data):
    """returns q1, q3, and iqr (interquartile range) of
    data (note data needs to be sorted)"""
    q1 = scoreatpercentile(data, 25)
    q3 = scoreatpercentile(data, 75)
    iqr = q3 - q1
    return q1, q3, iqr
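A short usage sketch (hypothetical data) showing calc_quartiles feeding the usual 1.5*IQR outlier fences; the input is sorted first, as the docstring asks:

import numpy as np
from scipy.stats import scoreatpercentile

data = np.sort(np.random.exponential(scale=2.0, size=500))
q1, q3, iqr = calc_quartiles(data)
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # Tukey fences
outliers = data[(data < lower) | (data > upper)]
print(q1, q3, iqr, len(outliers))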
Example 17
def confidence_interval_1d(A, alpha=SIGMA1, metric=np.mean,
                           numResamples=10000, interpolate=True):
    """Calculate confidence interval along one dimensional array."""
    if not isinstance(alpha, collections.Iterable):
        alpha = np.array([alpha])

    N = len(A)
    resampleInds = np.random.randint(0, N, (numResamples, N))
    metricOfResampled = metric(A[resampleInds], axis=-1)
    confidenceInterval = np.zeros(2*len(alpha), dtype='float')

    if interpolate:
        for thisAlphaInd, thisAlpha in enumerate(alpha):
            percenPos = (thisAlpha * 100 / 2.0)
            samplePos = scoreatpercentile(metricOfResampled, percenPos)
            confidenceInterval[2*thisAlphaInd] = samplePos
            percenNeg = (100 - thisAlpha * 100 / 2.0)
            sampleNeg = scoreatpercentile(metricOfResampled, percenNeg)
            confidenceInterval[2*thisAlphaInd+1] = sampleNeg
    else:
        sortedMetricOfResampled = np.sort(metricOfResampled)
        for thisAlphaInd, thisAlpha in enumerate(alpha):
            percenPos = int(round(thisAlpha*numResamples / 2.0))
            samplePos = sortedMetricOfResampled[percenPos]
            confidenceInterval[2*thisAlphaInd] = samplePos
            percenNeg = int(round(numResamples -
                                  (thisAlpha * numResamples / 2.0)))
            sampleNeg = sortedMetricOfResampled[percenNeg]
            confidenceInterval[2*thisAlphaInd+1] = sampleNeg
    return confidenceInterval
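A usage sketch (synthetic data, explicit alpha instead of the module's SIGMA1 constant); note that on Python 3.10+ the helper's collections.Iterable lookup would need collections.abc.Iterable:

import numpy as np
from scipy.stats import scoreatpercentile

np.random.seed(0)
samples = np.random.normal(loc=5.0, scale=2.0, size=200)
ci = confidence_interval_1d(samples, alpha=0.05, metric=np.mean, numResamples=2000)
print(ci)   # [2.5th percentile, 97.5th percentile] of the resampled means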
Example 18
def main(datafile, feature1, feature2, bins, percentile, copula, logscale):
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]

    if percentile > 0 and not copula:
        bx = np.linspace(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile),
            bins)
        by = np.linspace(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100-percentile),
            bins)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    if logscale:
        pl.hist2d(x, y, bins=bins, norm=LogNorm())
    else:
        pl.hist2d(x, y, bins=bins)
    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
Example 19
def freedman_diaconis(data):
    """
    Estimate an optimal number of histogram bins using the Freedman-Diaconis
    rule of thumb.
    
    Parameters
    ----------
    data : ndarray
        The data to histogram.
        
    Returns
    -------
    n_bins : int
        The number of bins to use.
    """
    
    data = data.flatten()
    
    q3 = stats.scoreatpercentile(data, 75.0)
    q1 = stats.scoreatpercentile(data, 25.0)
    
    h  = 2.0 * (q3 - q1) * np.power(len(data), -1.0/3.0)
    n_bins = int( ( np.max(data) - np.min(data) ) / h )
    
    return n_bins
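A quick sketch of how freedman_diaconis might be used to choose histogram bins (synthetic data; assumes the function above plus numpy, scipy.stats as stats, and matplotlib are available):

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

samples = np.random.standard_normal(10000)
n_bins = freedman_diaconis(samples)
print(n_bins)
plt.hist(samples, bins=n_bins)
plt.show()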
def filterOutliers(times, irq_range=.5, rtn_range=False):
    # We need the original indexes for marking which were removed
    original_times = list(times)

    times.sort()
    np_times = array(times)
    q1 = scoreatpercentile(np_times, 25, interpolation_method='fraction')
    q3 = scoreatpercentile(np_times, 75, interpolation_method='fraction')
    irq = q3-q1
    slack = irq_range * irq

    okRange = (q1-slack, q3+slack)

    newTimes = []
    rm_indexes = []
    for index,time in enumerate(original_times):
        # If there is only one element in the list it may not be an outlier,
        # but it could still be erroneous
        if time > max_key_time:
            rm_indexes.append(index)
            continue

        if okRange[0] <= time <= okRange[1]:
            newTimes.append(time)
        else:
            rm_indexes.append(index)

    if rtn_range:
        return newTimes, rm_indexes, okRange
    else:
        return newTimes, rm_indexes
Example 21
def mk_image(galaxy):
    base = './../../images_v5/GS_2.5as_matched/gs_all_'

    i_img = pyf.getdata(base+str(galaxy)+'_I.fits')
    j_img = pyf.getdata(base+str(galaxy)+'_J.fits')
    h_img = pyf.getdata(base+str(galaxy)+'_H.fits')

    # clip each band at its 99th percentile
    x = pyl.hstack(i_img)
    i_lim = scoreatpercentile(x,99)
    x = pyl.hstack(j_img)
    j_lim = scoreatpercentile(x,99)
    x = pyl.hstack(h_img)
    h_lim = scoreatpercentile(x,99)

    print galaxy, i_lim, j_lim, h_lim

    img = pyl.zeros((h_img.shape[0], h_img.shape[1], 3), dtype=float)
    img[:,:,0] = img_scale.asinh(h_img, scale_min=-0.1*h_lim, scale_max=h_lim,
            non_linear=0.5)
    img[:,:,1] = img_scale.asinh(j_img, scale_min=-0.1*j_lim, scale_max=j_lim,
            non_linear=0.5)
    img[:,:,2] = img_scale.asinh(i_img, scale_min=-0.1*i_lim, scale_max=i_lim,
            non_linear=0.5)

    return img
Example 22
def average_bins(xdata, ydata, xmin, xmax, nxbins=15):
    """
    Computes the mean and the 16th and 84th percentiles of y-data in bins in x.

    :param xdata: numpy array of xdata
    :param ydata: numpy array of ydata
    :param xmax: maximum value of x that data are binned to
    :param xmin: minimum value of x that data are binned to
    :param nxbins: number of bins in x

    :return: mid points of the bins, mean, 16th percentile, and 84th percentile.
    """
    xbin = N.linspace(xmin, xmax, nxbins)
    nbin = len(xbin) - 1
    xbin_mid = N.zeros(nbin)
    y50 = N.zeros(nbin) - 99.0
    y16 = N.zeros(nbin) - 99.0
    y84 = N.zeros(nbin) - 99.0

    for i in range(nbin):
        xbin_mid[i] = xbin[i] + 0.5 * (xbin[i + 1] - xbin[i])
        mask = (xdata > xbin[i]) & (xdata <= xbin[i + 1])
        if len(ydata[mask]) >= 10:
            y50[i] = N.mean(ydata[mask])
            y16[i] = ss.scoreatpercentile(ydata[mask], 16)
            y84[i] = ss.scoreatpercentile(ydata[mask], 84)
    return xbin_mid, y50, y16, y84
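A usage sketch for average_bins with synthetic scatter (numpy and scipy.stats are imported under the N and ss aliases the function expects):

import numpy as N
import scipy.stats as ss
import matplotlib.pyplot as plt

xdata = N.random.uniform(0.0, 10.0, 5000)
ydata = 2.0 * xdata + N.random.normal(0.0, 1.0, 5000)

mid, ymean, y16, y84 = average_bins(xdata, ydata, xmin=0.0, xmax=10.0, nxbins=20)
good = ymean > -99.0                      # bins that had at least 10 points
plt.fill_between(mid[good], y16[good], y84[good], alpha=0.3)
plt.plot(mid[good], ymean[good], 'k-')
plt.show()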
Example 23
def find_offset(p1, p2):
    datarange = (p1['long'].min(), p1['long'].min(), p1['long'].max(), p1['lat'].min(), p1['lat'].max())    
    long_min = max(p1['long'].min(), p2['long'].min())
    long_max = min(p1['long'].max(), p2['long'].max())
    lat_min = max(p1['lat'].min(), p2['lat'].min())
    lat_max = min(p1['lat'].max(), p2['lat'].max())
    select = (p1['long'] >= long_min) & (p1['long'] <= long_max) & (p1['lat'] >= lat_min) & (p1['lat'] <= lat_max)
    p1 = p1[select]
    select = (p2['long'] >= long_min) & (p2['long'] <= long_max) & (p2['lat'] >= lat_min) & (p2['lat'] <= lat_max)
    p2 = p2[select]
    if len(p1) < 1 or len(p2) < 1:
        print('Too few craters to find offset')
        return [0.0, 0.0]
    if len(p1) > 100 and len(p2) > 100:
        big = scoreatpercentile(p1['radius'], 75)
        big = min(big, scoreatpercentile(p2['radius'], 75))
        p1 = p1[p1['radius'] > big]
        p2 = p2[p2['radius'] > big]
    minsize1 = numpy.zeros(p1.shape[0], [('minsize', numpy.double)])
    minsize2 = numpy.zeros(p2.shape[0], [('minsize', numpy.double)])
    X1 = numpy.asarray([p1[name] for name in ('long', 'lat', 'radius')]+[minsize1['minsize']], order='c', dtype=numpy.double)
    X2 = numpy.asarray([p2[name] for name in ('long', 'lat', 'radius')]+[minsize2['minsize']], order='c', dtype=numpy.double)
    results = fmin(comparedata, [0.0, 0.0], args=(X1, X2), xtol=0.001, maxiter=1000)
    results *= degrees_per_metre  # convert from rough metres to degrees
    print('Found a shift of dlong = %e deg, dlat = %e deg'%tuple(results))
    return results
def plot_xwt_wavetransf(power, time, wa, T, S, sig95, pangle, time_base, scalemin=0, scalemax=6, ylabel='Pressure (mb)', plot_percentile=False):
    """plotting WaveTransform Power with confidence interval contour and phase vectors"""

    fig = plt.figure(10)
    ax = plt.subplot(1,1,1)
    if plot_percentile:
        #use following to contour at "percentiles variances" when using non-normalized data to match web output
        csf =plt.contourf(T, S, power, levels=[ 0, stats.scoreatpercentile(power, 25), stats.scoreatpercentile(power, 50),
                                           stats.scoreatpercentile(power, 75), stats.scoreatpercentile(power, 95), 
                                           stats.scoreatpercentile(power, 100)], colors=bmap)
    else:
        #use following to contour at "normalized variances" BAMS
        csf =plt.contourf(T, S, power, levels=[ 0, .2,.4,.6,.8,1], colors=bmap)
    cbar = plt.colorbar(pad=.1, shrink=.5, format='%.4f', extend='both') #move and shrink colorbar
    levels = [-99, 1] # values greater than 1 are significant
    plt.contour(T, S, sig95,levels, colors='black', linewidths=1)
    ax.set_yscale('log')
    ax.grid(True)
    
    # plot phase relationship 
    arr_dens = [60, 30]
    arr_densx = np.round( len(time) / arr_dens[0] )
    arr_densy = np.round( len(wa.scales) / arr_dens[1] )
    if arr_dens == 0:
        arr_dens = 1
        
    plt.quiver(T[::arr_densy,::arr_densx],S[::arr_densy,::arr_densx],(np.cos(pangle))[::arr_densy,::arr_densx],(np.sin(pangle))[::arr_densy,::arr_densx],\
    width=.00125, headwidth=4, headlength=4, alpha=0.6, color='k')

    # put the ticks at powers of 2 in the scale
    ticks = np.unique(2 ** np.floor(np.log2(wa.scales)))[1:]
    ax.yaxis.set_ticks(ticks)
    ax.yaxis.set_ticklabels(ticks.astype(str))
    ax.set_ylim(scalemax, scalemin)
    ax.set_ylabel('scales')

    # second y scale with equivalent fourier periods to scales
    # except with the ticks at the powers of 2
    ax_fourier = ax.twinx()
    ax_fourier.set_yscale('log')
    # match the fourier ticks to the scale ticks
    ax_fourier.set_yticks(ticks)
    ax_fourier.set_yticklabels(ticks.astype(str))
    ax_fourier.set_ylabel('fourier period (%s)' % time_base )
    fourier_lim = [wa.fourier_period(i) for i in ax.get_ylim()]
    ax_fourier.set_ylim(fourier_lim)

    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    fig.autofmt_xdate()

    # shade the region between the edge and coi
    C, S = wa.coi
    ax.fill_between(x=C, y1=S, y2=wa.scales.max(), color='gray', alpha=0.5)
    ax.set_xlim(time.min(), time.max())
        
    #plt.show()
    DefaultSize = fig.get_size_inches()
    fig.set_size_inches( (DefaultSize[0]*2, DefaultSize[1]) )
    
    return (plt, fig)
Example 25
def shell_move(inAtom,atomIndex):
  #  we're going to be changing the position of atomIndex inside inAtom
  #  make sure that you remove any crazy outliers before you do this
  #  or else it'll just make a bunch more outliers, which is a poor idea
  #  make sure atomIndex comes from range(len(inAtom.get_positions())) so we don't get out of bounds
  try:
    inCOM = inAtom.get_center_of_mass()
    inDistances = distanceCenter(inAtom)
    ninetyNinthRadius = stats.scoreatpercentile(inDistances,99)
    ninetyFifthRadius = stats.scoreatpercentile(inDistances,95)
    outerFourRadius = ninetyNinthRadius - ninetyFifthRadius

    randomNewRadius = random.normal( (ninetyNinthRadius+ninetyFifthRadius)/2 , (ninetyNinthRadius - ninetyFifthRadius)/2 )
    xFromCenter = random.uniform(0,randomNewRadius)
    randomNewRadius = ((randomNewRadius**2) - (xFromCenter**2))**0.5
    yFromCenter = random.uniform(0,randomNewRadius)
    zFromCenter = ((randomNewRadius**2) - (yFromCenter**2))**0.5

    newXPosition = inCOM[0] + plusOrMinus()*xFromCenter
    newYPosition = inCOM[1] + plusOrMinus()*yFromCenter
    newZPosition = inCOM[2] + plusOrMinus()*zFromCenter

    positionArray = inAtom.get_positions()
    positionArray[atomIndex] = (newXPosition,newYPosition,newZPosition)
    inAtom.set_positions(positionArray)

    return inAtom

  except IndexError:
    print "The index of the atom you wanted to move is too high or too low."
    print "Please check your function call of shell_move(a,b)"
    print "-Jeff"
def fitnesscost_confidence(region, data, ax=None, fname=None):
    '''
    bootstrap the fitness cost estimates and make distributions of the bootstrapped
    values for subsets of sites with a defined median. this should give an impression
    of how variable the estimates are. three such distributions are combined in one
    figure
    '''
    from util import add_panel_label

    # generate bootstrap estimates of minor SNP frequencies
    av = process_average_allele_frequencies(data, [region],
                    nbootstraps=100, bootstrap_type='bootstrap')
    combined_af = av['combined_af']
    combined_entropy = av['combined_entropy']
    minor_af = av['minor_af']
    combined_entropy_bs = av['combined_entropy_bs']
    minor_af_bs = av['minor_af_bs']

    # convert minor_af to 100x(length of gene) array of minor SNPs
    minor_af_array=np.array(minor_af_bs[region])
    qtiles = np.vstack([scoreatpercentile(minor_af_array, x, axis=0) for x in [25, 50, 75]])
    # calculate selection coefficient quantiles corresponding to SNP_freq quantiles
    scb = (data['mut_rate'][region]/(af_cutoff+qtiles)).T
    sel_coeff_array = (data['mut_rate'][region]/(af_cutoff+minor_af_array))
    sel_coeff_array[sel_coeff_array<0.001]=0.001
    sel_coeff_array[sel_coeff_array>0.1]=0.1
    which_quantile = np.zeros(minor_af_array.shape[1], dtype=int)
    thres = [20,40,60]
    for i,ql in enumerate(thres):
        # take sites in the slice [ql, ql+2]
        sl,su=scoreatpercentile(scb[:,1], ql), scoreatpercentile(scb[:,1], ql+2)
        which_quantile[(scb[:,1]>=sl)&(scb[:,1]<su)]=i+1

    scb[scb>0.1]=0.1
    scb[scb<0.001]=0.001
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8,6))
    for i in range(1,len(thres)+1):
        try:
            ind = (which_quantile==i)&(~np.any(np.isnan(sel_coeff_array),axis=0))
            npoints = ind.sum()*sel_coeff_array.shape[0]
            ax.plot(np.median(scb[ind,1])*np.ones(2), [0,0.5], c=cols[i+3], lw=4)
            ax.hist(sel_coeff_array[:,ind].flatten(), weights=np.ones(npoints,dtype=float)/npoints,
                    bins=np.logspace(-3, -1, 21),alpha=0.7, color=cols[i+3])
        except:
            import ipdb; ipdb.set_trace()
    ax.set_xscale('log')
    ax.set_xlabel('fitness cost', fontsize=fs)
    ax.set_ylabel('normalized counts', fontsize=fs)
    ax.tick_params(labelsize=fs*0.8)
    region_panels = {'gag': 'A', 'pol': 'B', 'env': 'E', 'nef': 'F', 'vif': 'C',
                     'vpu': 'E', 'vpr': 'G'}
    ax.text(0.1, 0.9, region,
            transform=ax.transAxes,
            fontsize=fs*1.5)

    if fname is not None:
        plt.tight_layout()
        for ext in ['png', 'svg', 'pdf']:
            plt.savefig(fname+'.'+ext)
def identify_at_risk(graph):
	'''
		We hypothesize that an actor is at risk for beginning to use drugs if:
			1. He weighs social influence more heavily than other actors. (His alpha [social susceptibility]
			   is in the upper quartile.)
			2. He receives more highly influential inputs than other actors. (His influence kernel is in the upper quartile with
				respect to the number of influencers he receives, where those influencers are the strongest in the network.)
			3. Those inputs he weighs highly are from people who consume drugs frequently and have a positive attitude towards
			    the consumption of drugs.
	'''
	all_kernel_values = np.concatenate(influence_kernel.values())

	upper_quartile_influence_kernel = scoreatpercentile(all_kernel_values, 75)
	
	#Calculate distribution of fraction of upper_quartile_influencers per user
	fraction_of_influencers_per_user = [(influence_kernel[agent]>upper_quartile_influence_kernel).sum()/float(len(influence_kernel[agent]))
			for agent in graph.nodes()]

	# Identify those who receive more influencers than other people
	upper_quartile_receiving_influence = scoreatpercentile(fraction_of_influencers_per_user,75)

	#At risk if recent uptick in drinking
	distribution_of_increases_in_drinking = np.diff(drinking_behavior[np.nonzero(drinking_behavior)], axis=1).ravel()
	threshold_for_concerning_drinking = scoreatpercentile(distribution_of_increases_in_drinking,75)
	print threshold_for_concerning_drinking

	upper_quartile_alpha = scoreatpercentile(alpha,75)
	at_risk = [agent for agent in graph.nodes() 
				if alpha[agent]>=upper_quartile_alpha and fraction_of_influencers(influence_kernel[agent],upper_quartile_influence_kernel)>upper_quartile_receiving_influence]
	return at_risk
Example 28
def percentile_bins(xdata, ydata, xmin, xmax, nxbins=15, log=False, limit=6):
    """
    Computes the median and the 16th and 84th percentiles of y-data in bins in x.

    :param xdata: numpy array of xdata
    :param ydata: numpy array of ydata
    :param xmax: maximum value of x that data are binned to
    :param xmin: minimum value of x that data are binned to
    :param nxbins: number of bins in x
    :param log: if True, xbins are logarithmically spaced, else linearly
    :param limit: the minimum number of values a bin must contain for its
                  median and percentiles to be returned

    :return: mid points of the bins, median, 16th percentile, and 84th percentile.
    """
    if log:
        xbin = N.logspace(xmin, xmax, nxbins)
    else:
        xbin = N.linspace(xmin, xmax, nxbins)
    nbin = len(xbin) - 1
    xbin_mid = N.zeros(nbin)
    y50 = N.zeros(nbin) - 99.0
    y16 = N.zeros(nbin) - 99.0
    y84 = N.zeros(nbin) - 99.0

    for i in range(nbin):
        xbin_mid[i] = xbin[i] + 0.5 * (xbin[i + 1] - xbin[i])
        mask = (xdata > xbin[i]) & (xdata <= xbin[i + 1])
        if len(ydata[mask]) >= limit:
            y50[i] = ss.scoreatpercentile(ydata[mask], 50)
            y16[i] = ss.scoreatpercentile(ydata[mask], 16)
            y84[i] = ss.scoreatpercentile(ydata[mask], 84)
    return xbin_mid, y50, y16, y84
Example 29
def coverage_plot(ax, x, data, color="red", percs=[50,90]):
    """
    ax = matplotlib axes instance
    x = x-axis coordinates
    data = profile data
    color = color in any way matplotlib accepts
    """
    
    # Might change this into an argument for the function
    percs = [(100 - float(p)) / 2 for p in percs[::-1]]
    alphas = [0.1, 0.4]

    # Convert to numpy array
    vals = array(data)

    # Get the median
    m = median(vals, axis=0)

    # Draw the minimum percentiles
    lines = [array([scoreatpercentile(vals[:,i], perc) for i in range(len(vals[0]))]) for perc in percs] + [m]
    for (line_min, line_max), alpha in zip([(lines[i], lines[i + 1]) for i in range(len(percs))], alphas):
        ax.fill_between(x, line_min, line_max, facecolor=color, alpha=alpha, edgecolor='face')    

    # Draw the maximum percentiles
    lines = [m] + [array([scoreatpercentile(vals[:,i], 100 - perc) for i in range(len(vals[0]))]) for perc in percs[::-1]] 
    for (line_min, line_max), alpha in zip([(lines[i], lines[i + 1]) for i in range(len(percs))], alphas[::-1]):

        ax.fill_between(x, line_min, line_max, facecolor=color, alpha=alpha, edgecolor='face')    
        
    # Draw the median
    ax.plot(x, m, color="black", alpha=0.95, linewidth=0.8)
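A brief usage sketch for coverage_plot (random profiles; the bare names array, median and scoreatpercentile used inside the function are imported explicitly here):

import numpy as np
import matplotlib.pyplot as plt
from numpy import array, median
from scipy.stats import scoreatpercentile

profiles = np.random.poisson(lam=10, size=(50, 200))   # 50 coverage profiles, 200 positions
x = np.arange(200)
fig, ax = plt.subplots()
coverage_plot(ax, x, profiles, color="blue", percs=[50, 90])
plt.show()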
Example 30
def bootstrap_stat(arr, stat=np.mean, n_iters=1000, alpha=0.05):
    """
    Produce a boot-strap distribution of the mean of an array on axis 0

    Parameters
    ---------
    arr : ndarray
       The array with data to be bootstrapped

    stat : callable
        The statistical function to call. will be called as `stat(arr, 0)`, so
        needs to accept that call signature.

    n_iters : int
        The number of bootstrap iterations to sample

    alpha : float
       The confidence interval size will be 1-alpha

    """
    stat_orig = stat(arr, 0)

    boot_arr = np.empty((arr.shape[-1] , n_iters))
    for ii in range(n_iters):
        this_arr=arr[np.random.random_integers(0, arr.shape[0]-1, arr.shape[0])]
        boot_arr[:, ii] = stat(this_arr, 0)

    eb = np.array([stats.scoreatpercentile(boot_arr[xx], 100 * (1 - alpha / 2)) -
                   stats.scoreatpercentile(boot_arr[xx], 100 * alpha / 2)
                   for xx in range(boot_arr.shape[0])])

    return stat_orig, eb
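A brief usage sketch (synthetic 2-D data): bootstrap_stat resamples along axis 0 and returns one statistic and one interval width per column:

import numpy as np
from scipy import stats

arr = np.random.normal(loc=3.0, scale=1.0, size=(100, 5))   # 100 samples, 5 variables
stat_orig, eb = bootstrap_stat(arr, stat=np.mean, n_iters=500, alpha=0.05)
print(stat_orig)   # column means of the original data
print(eb)          # widths of the 95% bootstrap intervals, one per column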
Example 31
    def add_graph(self,
                  adjacency_matrix,
                  node_coords,
                  node_color='auto',
                  node_size=50,
                  edge_cmap=cm.bwr,
                  edge_threshold=None,
                  edge_kwargs=None,
                  node_kwargs=None):
        """Plot undirected graph on each of the axes

            Parameters
            ----------
            adjacency_matrix: numpy array of shape (n, n)
                represents the edges strengths of the graph. Assumed to be
                a symmetric matrix.
            node_coords: numpy array_like of shape (n, 3)
                3d coordinates of the graph nodes in world space.
            node_color: color or sequence of colors
                color(s) of the nodes.
            node_size: scalar or array_like
                size(s) of the nodes in points^2.
            edge_cmap: colormap
                colormap used for representing the strength of the edges.
            edge_threshold: str or number
                If it is a number only the edges with a value greater than
                edge_threshold will be shown.
                If it is a string it must finish with a percent sign,
                e.g. "25.3%", and only the edges with a abs(value) above
                the given percentile will be shown.
            edge_kwargs: dict
                will be passed as kwargs for each edge matlotlib Line2D.
            node_kwargs: dict
                will be passed as kwargs to the plt.scatter call that plots all
                the nodes in one go.

        """
        # set defaults
        if edge_kwargs is None:
            edge_kwargs = {}
        if node_kwargs is None:
            node_kwargs = {}
        if node_color == 'auto':
            nb_nodes = len(node_coords)
            node_color = mpl_cm.Set2(np.linspace(0, 1, nb_nodes))

        node_coords = np.asarray(node_coords)

        # decompress input matrix if sparse
        if sparse.issparse(adjacency_matrix):
            adjacency_matrix = adjacency_matrix.toarray()

        # make the lines below well-behaved
        adjacency_matrix = np.nan_to_num(adjacency_matrix)

        # safety checks
        if 's' in node_kwargs:
            raise ValueError("Please use 'node_size' and not 'node_kwargs' "
                             "to specify node sizes")
        if 'c' in node_kwargs:
            raise ValueError("Please use 'node_color' and not 'node_kwargs' "
                             "to specify node colors")

        adjacency_matrix_shape = adjacency_matrix.shape
        if (len(adjacency_matrix_shape) != 2
                or adjacency_matrix_shape[0] != adjacency_matrix_shape[1]):
            raise ValueError(
                "'adjacency_matrix' is supposed to have shape (n, n)."
                ' Its shape was {0}'.format(adjacency_matrix_shape))

        node_coords_shape = node_coords.shape
        if len(node_coords_shape) != 2 or node_coords_shape[1] != 3:
            message = (
                "Invalid shape for 'node_coords'. You passed an "
                "'adjacency_matrix' of shape {0} therefore "
                "'node_coords' should be a array with shape ({0[0]}, 3) "
                'while its shape was {1}').format(adjacency_matrix_shape,
                                                  node_coords_shape)

            raise ValueError(message)

        if node_coords_shape[0] != adjacency_matrix_shape[0]:
            raise ValueError(
                "Shape mismatch between 'adjacency_matrix' "
                "and 'node_coords'"
                "'adjacency_matrix' shape is {0}, 'node_coords' shape is {1}".
                format(adjacency_matrix_shape, node_coords_shape))

        if not np.allclose(adjacency_matrix, adjacency_matrix.T, rtol=1e-3):
            raise ValueError("'adjacency_matrix' should be symmetric")

        # For a masked array, masked values are replaced with zeros
        if hasattr(adjacency_matrix, 'mask'):
            if not (adjacency_matrix.mask == adjacency_matrix.mask.T).all():
                raise ValueError(
                    "'adjacency_matrix' was masked with a non symmetric mask")
            adjacency_matrix = adjacency_matrix.filled(0)

        if edge_threshold is not None:
            if isinstance(edge_threshold, _basestring):
                message = ("If 'edge_threshold' is given as a string it "
                           'should be a number followed by the percent sign, '
                           'e.g. "25.3%"')
                if not edge_threshold.endswith('%'):
                    raise ValueError(message)

                try:
                    percentile = float(edge_threshold[:-1])
                except ValueError as exc:
                    exc.args += (message, )
                    raise

                # Keep a percentile of edges with the highest absolute
                # values, so only need to look at the covariance
                # coefficients below the diagonal
                lower_diagonal_indices = np.tril_indices_from(adjacency_matrix,
                                                              k=-1)
                lower_diagonal_values = adjacency_matrix[
                    lower_diagonal_indices]
                edge_threshold = stats.scoreatpercentile(
                    np.abs(lower_diagonal_values), percentile)

            elif not isinstance(edge_threshold, numbers.Real):
                raise TypeError('edge_threshold should be either a number '
                                'or a string finishing with a percent sign')

            adjacency_matrix = adjacency_matrix.copy()
            threshold_mask = np.abs(adjacency_matrix) < edge_threshold
            adjacency_matrix[threshold_mask] = 0

        lower_triangular_adjacency_matrix = np.tril(adjacency_matrix, k=-1)
        non_zero_indices = lower_triangular_adjacency_matrix.nonzero()

        line_coords = [
            node_coords[list(index)] for index in zip(*non_zero_indices)
        ]

        adjacency_matrix_values = adjacency_matrix[non_zero_indices]
        for ax in self.axes.values():
            ax._add_markers(node_coords, node_color, node_size, **node_kwargs)
            ax._add_lines(line_coords, adjacency_matrix_values, edge_cmap,
                          **edge_kwargs)

        plt.draw_if_interactive()
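The percentile-string handling of edge_threshold above reduces to something like this standalone sketch (hypothetical symmetric matrix):

import numpy as np
from scipy import stats

m = np.random.randn(10, 10)
adjacency_matrix = (m + m.T) / 2.0                # symmetric toy matrix

edge_threshold = "75%"
percentile = float(edge_threshold[:-1])
lower_values = adjacency_matrix[np.tril_indices_from(adjacency_matrix, k=-1)]
threshold = stats.scoreatpercentile(np.abs(lower_values), percentile)

# keep only the strongest 25% of edges
thresholded = np.where(np.abs(adjacency_matrix) < threshold, 0.0, adjacency_matrix)
print(threshold, np.count_nonzero(np.tril(thresholded, k=-1)))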
Example 32
def setup():
    GG = nx.DiGraph()
    with open('traces/ripple/jan2013-lcc-t0.graph_CREDIT_LINKS', 'r') as f:
        for line in f:
            source = int(line.split()[0])
            destination = int(line.split()[1])
            total_channel_cap = (float(line.split()[3]) - float(
                line.split()[2])) + (float(line.split()[4]) -
                                     float(line.split()[3]))
            if total_channel_cap > 0:
                GG.add_edge(source,
                            destination,
                            capacity=total_channel_cap / 2)
                GG.add_edge(destination,
                            source,
                            capacity=total_channel_cap / 2)

    while True:
        nodes_to_remove = []
        for node_index in list(GG.nodes()):
            if len(list(GG.neighbors(node_index))) < 2:
                nodes_to_remove.append(node_index)

        if len(nodes_to_remove) == 0:
            break

        for node_index in nodes_to_remove:
            GG.remove_node(node_index)

    node_list = list(GG.nodes())

    random.seed(2)
    # make the node index be continuous
    G = nx.DiGraph()
    for e in GG.edges():
        G.add_edge(node_list.index(e[0]),
                   node_list.index(e[1]),
                   capacity=GG[e[0]][e[1]]['capacity'],
                   cost=random.random() * 10)
        G.add_edge(node_list.index(e[1]),
                   node_list.index(e[0]),
                   capacity=GG[e[1]][e[0]]['capacity'],
                   cost=random.random() * 10)

    # transaction fees for 10% edges are especially high
    random_edges = []
    random_edges = random.sample(xrange(G.number_of_edges()),
                                 int(G.number_of_edges() * 0.1))
    i = 0
    for e in G.edges():
        if i in random_edges:
            G[e[0]][e[1]]['cost'] = G[e[0]][e[1]]['cost'] * 10
        i += 1

    listC = []
    for e in G.edges():
        listC.append(G[e[0]][e[1]]['capacity'])
        listC.append(G[e[1]][e[0]]['capacity'])

    print "number of nodes", len(G)
    print 'average channel cap', float(sum(listC)) / len(listC)
    print 'num of edges', len(listC)

    sorted_var = np.sort(listC)
    print 'median capacity', stats.scoreatpercentile(sorted_var, 50)

    trans = []
    with open('traces/ripple/ripple_val.csv', 'r') as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            if float(row[2]) > 0:
                src = int(row[0]) % len(G)
                dst = int(row[1]) % len(G)
                if src == dst:
                    continue
                trans.append((int(src), int(dst), float(row[2])))
    print 'num of transactions', len(trans)

    return G, trans
Example 33
    def fit(self,
            q=.5,
            vcov='robust',
            kernel='epa',
            bandwidth='hsheather',
            max_iter=1000,
            p_tol=1e-6,
            **kwargs):
        '''Solve by Iterative Weighted Least Squares

        Parameters
        ----------
        q : float
            Quantile must be between 0 and 1
        vcov : string, method used to calculate the variance-covariance matrix
            of the parameters. Default is ``robust``:

            - robust : heteroskedasticity robust standard errors (as suggested
              in Greene 6th edition)
            - iid : iid errors (as in Stata 12)

        kernel : string, kernel to use in the kernel density estimation for the
            asymptotic covariance matrix:

            - epa: Epanechnikov
            - cos: Cosine
            - gau: Gaussian
            - par: Parzen
            - biw: Biweight

        bandwidth: string, Bandwidth selection method in kernel density
            estimation for asymptotic covariance estimate (full
            references in QuantReg docstring):

            - hsheather: Hall-Sheather (1988)
            - bofinger: Bofinger (1975)
            - chamberlain: Chamberlain (1994)
        '''

        if q < 0 or q > 1:
            raise Exception('p must be between 0 and 1')

        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
        if kernel not in kern_names:
            raise Exception("kernel must be one of " + ', '.join(kern_names))
        else:
            kernel = kernels[kernel]

        if bandwidth == 'hsheather':
            bandwidth = hall_sheather
        elif bandwidth == 'bofinger':
            bandwidth = bofinger
        elif bandwidth == 'chamberlain':
            bandwidth = chamberlain
        else:
            raise Exception(
                "bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

        endog = self.endog
        exog = self.exog
        nobs = self.nobs
        exog_rank = np_matrix_rank(self.exog)
        self.rank = exog_rank
        self.df_model = float(self.rank - self.k_constant)
        self.df_resid = self.nobs - self.rank
        n_iter = 0
        xstar = exog

        beta = np.ones(exog_rank)
        # TODO: better start, initial beta is used only for convergence check

        # Note the following doesn't work yet,
        # the iteration loop always starts with OLS as initial beta
        #        if start_params is not None:
        #            if len(start_params) != rank:
        #                raise ValueError('start_params has wrong length')
        #            beta = start_params
        #        else:
        #            # start with OLS
        #            beta = np.dot(np.linalg.pinv(exog), endog)

        diff = 10
        cycle = False

        history = dict(params=[], mse=[])
        while n_iter < max_iter and diff > p_tol and not cycle:
            n_iter += 1
            beta0 = beta
            xtx = np.dot(xstar.T, exog)
            xty = np.dot(xstar.T, endog)
            beta = np.dot(pinv(xtx), xty)
            resid = endog - np.dot(exog, beta)

            mask = np.abs(resid) < .000001
            resid[mask] = np.sign(resid[mask]) * .000001
            resid = np.where(resid < 0, q * resid, (1 - q) * resid)
            resid = np.abs(resid)
            xstar = exog / resid[:, np.newaxis]
            diff = np.max(np.abs(beta - beta0))
            history['params'].append(beta)
            history['mse'].append(np.mean(resid * resid))

            if (n_iter >= 300) and (n_iter % 100 == 0):
                # check for convergence circle, shouldn't happen
                for ii in range(2, 10):
                    if np.all(beta == history['params'][-ii]):
                        cycle = True
                        break
                warnings.warn("Convergence cycle detected", ConvergenceWarning)

        if n_iter == max_iter:
            warnings.warn("Maximum number of iterations (1000) reached.",
                          IterationLimitWarning)

        e = endog - np.dot(exog, beta)
        # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
        # h = 0.9 * np.std(e) / (nobs**0.2)
        # Instead, we calculate bandwidth as in Stata 12
        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
        h = bandwidth(nobs, q)
        h = min(np.std(endog),
                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

        if vcov == 'robust':
            d = np.where(e > 0, (q / fhat0)**2, ((1 - q) / fhat0)**2)
            xtxi = pinv(np.dot(exog.T, exog))
            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
            vcov = chain_dot(xtxi, xtdx, xtxi)
        elif vcov == 'iid':
            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
        else:
            raise Exception("vcov must be 'robust' or 'iid'")

        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

        lfit.q = q
        lfit.iterations = n_iter
        lfit.sparsity = 1. / fhat0
        lfit.bandwidth = h
        lfit.history = history

        return RegressionResultsWrapper(lfit)
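For context, a minimal sketch of how this fit is usually reached through the public statsmodels API (synthetic data; assumes statsmodels is installed and imported as sm):

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = np.random.uniform(0.0, 10.0, 500)
y = 1.0 + 0.5 * x + np.random.standard_t(df=3, size=500)   # heavy-tailed noise

X = sm.add_constant(x)
res = sm.QuantReg(y, X).fit(q=0.5, vcov='robust', kernel='epa', bandwidth='hsheather')
print(res.params)      # intercept and slope of the median regression
print(res.bandwidth)   # the kernel bandwidth h computed in the code above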
Example 34
def do_javelin(ra, dec):
    #ra and dec are converted to 6 decimals format
    ras = "%.6f" % ra
    decs = "%.6f" % dec

    print "########## computing Javelin for source located at ra=%f and dec=%f  ##########" % (
        ra, dec)

    try:

        # the driving lc is loaded; we take into account the different formats used for opt and NIR data.
        if driving_filter != 'Q':
            agn_driving = lc_path + driving_filter + '/agn_' + str(
                ras) + '_' + str(decs) + '_' + driving_filter + '.fits'
            arch_driving = pf.open(agn_driving)
            jd_0 = 55000
            head_driving = arch_driving[0].header
            datos_driving = arch_driving[1].data
            jd_driving = datos_driving['JD'] - jd_0
            flux_driving = datos_driving[
                'FLUX_2'] * 1e27  # the flux value is multiplied by 1e27 to avoid numerical errors produced by small numbers
            errflux_driving = datos_driving['FLUXERR_2'] * 1e27
            zspec_driving = head_driving['REDSHIFT']

        else:
            try:
                agn_driving = lc_path + driving_filter + '/bin3_onechip_' + str(
                    ras) + '_' + str(decs) + '_' + field + '.fits'
                arch_driving = pf.open(agn_driving)
            except:
                agn_driving = lc_path + driving_filter + '/bin3_morechip_' + str(
                    ras) + '_' + str(decs) + '_' + field + '.fits'
                arch_driving = pf.open(agn_driving)
            jd_0 = 2455000
            head_driving = arch_driving[0].header
            datos_driving = arch_driving[1].data
            jd_driving = datos_driving['JD'] - jd_0
            flux_driving = datos_driving[
                'fluxQ'] * 1e27  # the flux value is multiplied by 1e27 to avoid numerical errors produced by small numbers
            errflux_driving = datos_driving['errfluxQ'] * 1e27
            zspec_driving = head_driving['REDSHIFT']

        lcd_name = 'temp/driving_lc_' + driving_filter + '_' + str(
            ras) + '_' + str(decs) + '.txt'
        np.savetxt(
            lcd_name,
            np.transpose([
                jd_driving / (1.0 + zspec_driving), flux_driving,
                errflux_driving
            ]))

        try:

            #reading the responding filter data
            if responding_filter != 'Q':
                agn_responding = lc_path + responding_filter + '/agn_' + str(
                    ras) + '_' + str(decs) + '_' + responding_filter + '.fits'
                arch_responding = pf.open(agn_responding)
                jd_0 = 55000
                head_responding = arch_responding[0].header
                datos_responding = arch_responding[1].data
                jd_responding = datos_responding['JD'] - jd_0
                flux_responding = datos_responding[
                    'FLUX_2'] * 1e27  # the flux value is multiplied by 1e27 to avoid numerical errors produced by small numbers
                errflux_responding = datos_responding['FLUXERR_2'] * 1e27
                zspec_responding = head_responding['REDSHIFT']
            else:
                try:
                    agn_driving = lc_path + responding_filter + '/bin3_onechip_' + str(
                        ras) + '_' + field + '.fits'
                    arch_responding = pf.open(agn_responding)
                except:
                    agn_driving = lc_path + responding_filter + '/bin3_morechip_' + str(
                        ras) + '_' + field + '.fits'
                    arch_responding = pf.open(agn_responding)
                jd_0 = 2455000
                head_responding = arch_responding[0].header
                datos_responding = arch_responding[1].data
                jd_responding = datos_responding['JD'] - jd_0
                flux_responding = datos_responding[
                    'fluxQ'] * 1e27  # the flux value is multiplied by 1e27 to avoid numerical errors produced by small numbers
                errflux_responding = datos_responding['errfluxQ'] * 1e27
                zspec_responding = head_responding['REDSHIFT']

            #converting lcs into a format accepted by the fortran method
            lcr_name = 'temp/responding_lc_' + responding_filter + '_' + str(
                ras) + '_' + str(decs) + '.txt'
            np.savetxt(
                lcr_name,
                np.transpose([
                    jd_responding / (1.0 + zspec_driving), flux_responding,
                    errflux_responding
                ]))

            #running Javelin
            cont = get_data([lcd_name], names=[driving_filter])
            cmod = Cont_Model(cont)
            cmod.do_mcmc(nwalkers=100,
                         nburn=50,
                         nchain=100,
                         fchain=jav_stat_path + "chain_cont_" +
                         driving_filter + "_vs_" + responding_filter + "_" +
                         str(ras) + "_" + str(decs) + ".txt")

            bothdata = get_data([lcd_name, lcr_name],
                                names=[driving_filter, responding_filter])
            mod_2band = Pmap_Model(bothdata)  #Rmap_Model(bothdata)
            mod_2band.do_mcmc(nwalkers=100,
                              nburn=50,
                              nchain=100,
                              conthpd=cmod.hpd,
                              laglimit=[[cent_lowlimit, cent_uplimit]],
                              widlimit=widlimit,
                              fchain=jav_stat_path + "jav_chain_all_" +
                              driving_filter + "_vs_" + responding_filter +
                              "_" + str(ras) + "_" + str(decs) + ".txt")

            sigma, tau, lag, width, scale = np.loadtxt(
                jav_stat_path + "jav_chain_all_" + driving_filter + "_vs_" +
                responding_filter + "_" + str(ras) + "_" + str(decs) + ".txt",
                unpack=True,
                usecols=[0, 1, 2, 3, 4])

            centau_median = np.median(lag)
            centau_uperr = (stats.scoreatpercentile(lag,
                                                    perclim)) - centau_median
            centau_loerr = centau_median - (stats.scoreatpercentile(
                lag, (100. - perclim)))
            len_chain = len(lag[np.where(lag > -2000000000000)])

            return (ra, dec, zspec_driving, len_chain, centau_median,
                    centau_loerr, centau_uperr)

        except:
            print "########## computing iccf FAILS for source located at ra=%f and dec=%f, NO RESPONDING LC available  ##########" % (
                ra, dec)
            cmd = 'rm ' + lcd_name
            os.system(cmd)
            return (ra, dec, -9999, -9999, -9999, -9999, -9999)
    except:

        print "########## computing iccf FAILS for source located at ra=%f and dec=%f, NO DRIVING LC available  ##########" % (
            ra, dec)

        return (ra, dec, -9999, -9999, -9999, -9999, -9999)
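# --- Added sketch (not part of the original example): the lag summary above
# --- reads the median and asymmetric errors off the MCMC chain with
# --- scipy.stats.scoreatpercentile. perclim=84 is an assumed value.
import numpy as np
from scipy import stats

def summarize_lag_chain(lag_chain, perclim=84.0):
    """Return (median, lower error, upper error) of a 1-D lag sample."""
    lag_chain = np.asarray(lag_chain)
    median = np.median(lag_chain)
    uperr = stats.scoreatpercentile(lag_chain, perclim) - median
    loerr = median - stats.scoreatpercentile(lag_chain, 100.0 - perclim)
    return median, loerr, uperr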
Example 35
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [%(levelname)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('*** Starting: Compute CC ***')

    # Connection to the DB
    db = connect()

    if len(get_filters(db, all=False)) == 0:
        logging.info("NO FILTERS DEFINED, exiting")
        sys.exit()

    # Get Configuration
    params = get_params(db)
    filters = get_filters(db, all=False)
    logging.info("Will compute %s" % " ".join(params.components_to_compute))

    if params.remove_response:
        logging.debug('Pre-loading all instrument response')
        responses = preload_instrument_responses(db)
    else:
        responses = None
    logging.info("Checking if there are jobs to do")
    while is_next_job(db, jobtype='CC'):
        logging.info("Getting the next job")
        jobs = get_next_job(db, jobtype='CC')

        if len(jobs) == 0:
            # edge case, should only occur when is_next returns true, but
            # get_next receives no jobs (heavily parallelised code)
            continue

        stations = []
        pairs = []
        refs = []

        for job in jobs:
            refs.append(job.ref)
            pairs.append(job.pair)
            netsta1, netsta2 = job.pair.split(':')
            stations.append(netsta1)
            stations.append(netsta2)
            goal_day = job.day

        stations = np.unique(stations)

        logging.info("New CC Job: %s (%i pairs with %i stations)" %
                     (goal_day, len(pairs), len(stations)))
        jt = time.time()

        comps = []
        for comp in params.components_to_compute:
            if comp[0] in ["R", "T"] or comp[1] in ["R", "T"]:
                comps.append("E")
                comps.append("N")
            else:
                comps.append(comp[0])
                comps.append(comp[1])
        comps = np.unique(comps)
        stream = preprocess(db, stations, comps, goal_day, params, responses)
        if not len(stream):
            logging.info("Not enough data for this day !")
            logging.info("Marking job Done and continuing with next !")
            for job in jobs:
                update_job(db, job.day, job.pair, 'CC', 'D', ref=job.ref)
            continue
        # print '##### STREAMS ARE ALL PREPARED AT goal Hz #####'
        dt = 1. / params.goal_sampling_rate
        logging.info("Starting slides")
        start_processing = time.time()
        allcorr = {}
        for tmp in stream.slide(params.corr_duration,
                                params.corr_duration * (1 - params.overlap)):
            logging.info("Processing %s - %s" %
                         (tmp[0].stats.starttime, tmp[0].stats.endtime))
            tmp = tmp.copy().sort()

            channels_to_remove = []
            for gap in tmp.get_gaps(min_gap=0):
                if gap[-2] > 0:
                    channels_to_remove.append(".".join(
                        [gap[0], gap[1], gap[2], gap[3]]))

            for chan in np.unique(channels_to_remove):
                logging.debug("%s contains gap(s), removing it" % chan)
                net, sta, loc, chan = chan.split(".")
                for tr in tmp.select(network=net,
                                     station=sta,
                                     location=loc,
                                     channel=chan):
                    tmp.remove(tr)
            if len(tmp) == 0:
                logging.debug("No traces without gaps")
                continue

            base = np.amax([tr.stats.npts for tr in tmp])
            if base <= (params.maxlag * params.goal_sampling_rate * 2 + 1):
                logging.debug("All traces shorter are too short to export"
                              " +-maxlag")
                continue

            for tr in tmp:
                if tr.stats.npts != base:
                    tmp.remove(tr)
                    logging.debug("One trace is too short, removing it")

            if len(tmp) == 0:
                logging.debug("No traces left in slice")
                continue

            nfft = next_fast_len(tmp[0].stats.npts)
            tmp.detrend("demean")

            for tr in tmp:
                if params.windsorizing == -1:
                    np.sign(tr.data, tr.data)  # inplace
                elif params.windsorizing != 0:
                    imin, imax = scoreatpercentile(tr.data, [1, 99])
                    not_outliers = np.where((tr.data >= imin)
                                            & (tr.data <= imax))[0]
                    rms = tr.data[not_outliers].std() * params.windsorizing
                    np.clip(tr.data, -rms, rms, tr.data)  # inplace
            # TODO should not hardcode 4 percent!
            tmp.taper(0.04)

            # TODO should not hardcode 100 taper points in spectrum
            napod = 100

            data = np.asarray([tr.data for tr in tmp])
            names = [tr.id.split(".") for tr in tmp]

            # index net.sta comps for energy later
            channel_index = {}
            psds = []
            for i, name in enumerate(names):
                n1, s1, l1, c1 = name
                netsta = "%s.%s" % (n1, s1)
                if netsta not in channel_index:
                    channel_index[netsta] = {}
                channel_index[netsta][c1[-1]] = i

                pxx, freqs = mlab.psd(tmp[i].data,
                                      Fs=tmp[i].stats.sampling_rate,
                                      NFFT=nfft,
                                      detrend='mean')
                psds.append(np.sqrt(pxx))
            psds = np.asarray(psds)

            for chan in channel_index:
                comps = channel_index[chan].keys()
                if "E" in comps and "N" in comps:
                    i_e = channel_index[chan]["E"]
                    i_n = channel_index[chan]["N"]
                    # iZ = channel_index[chan]["Z"]
                    mm = psds[[i_e, i_n]].mean(axis=0)
                    psds[i_e] = mm
                    psds[i_n] = mm
                    # psds[iZ] = mm

            # define pairwise CCs
            tmptime = tmp[0].stats.starttime.datetime
            thisdate = tmptime.strftime("%Y-%m-%d")
            thistime = tmptime.strftime("%Y-%m-%d %H:%M:%S")
            pair_index = []

            # Different iterator func if autocorr:
            if params.autocorr:
                iterfunc = itertools.combinations_with_replacement
            else:
                iterfunc = itertools.combinations
            for sta1, sta2 in iterfunc(names, 2):
                n1, s1, l1, c1 = sta1
                n2, s2, l2, c2 = sta2
                comp = "%s%s" % (c1[-1], c2[-1])
                if comp in params.components_to_compute:
                    pair_index.append([
                        "%s.%s_%s.%s_%s" % (n1, s1, n2, s2, comp),
                        names.index(sta1),
                        names.index(sta2)
                    ])

            for filterdb in filters:
                filterid = filterdb.ref
                low = float(filterdb.low)
                high = float(filterdb.high)

                freq_vec = scipy.fftpack.fftfreq(nfft, d=dt)[:nfft // 2]
                freq_sel = np.where((freq_vec >= low) & (freq_vec <= high))[0]
                low = freq_sel[0] - napod
                if low <= 0:
                    low = 1
                p1 = freq_sel[0]
                p2 = freq_sel[-1]
                high = freq_sel[-1] + napod
                if high > nfft / 2:
                    high = int(nfft // 2)

                ffts = scipy.fftpack.fftn(data, shape=[
                    nfft,
                ], axes=[
                    1,
                ])
                # TODO: AC will require a more clever handling, no whiten...
                whiten2(ffts, nfft, low, high, p1, p2, psds,
                        params.whitening)  # inplace
                # energy = np.sqrt(np.sum(np.abs(ffts)**2, axis=1)/nfft)
                energy = np.real(
                    np.sqrt(
                        np.mean(scipy.fftpack.ifft(ffts, n=nfft, axis=1)**2,
                                axis=1)))

                # logging.info("Pre-whitened %i traces"%(i+1))
                corr = myCorr2(ffts,
                               np.ceil(params.maxlag / dt),
                               energy,
                               pair_index,
                               plot=False,
                               nfft=nfft)

                for key in corr:
                    ccfid = key + "_%02i" % filterid + "_" + thisdate
                    if ccfid not in allcorr:
                        allcorr[ccfid] = {}
                    allcorr[ccfid][thistime] = corr[key]
                del corr

        # Needed to clean the FFT memory caching of SciPy
        clean_scipy_cache()

        if params.keep_all:
            for ccfid in allcorr.keys():
                export_allcorr2(db, ccfid, allcorr[ccfid])

        if params.keep_days:
            for ccfid in allcorr.keys():
                station1, station2, components, filterid, date = \
                    ccfid.split('_')

                corrs = np.asarray(list(allcorr[ccfid].values()))
                if not len(corrs):
                    logging.debug("No data to stack.")
                    continue
                corr = stack(corrs, params.stack_method, params.pws_timegate,
                             params.pws_power, params.goal_sampling_rate)
                if not len(corr):
                    logging.debug("No data to save.")
                    continue
                thisdate = goal_day
                thistime = "0_0"
                add_corr(db,
                         station1.replace('.', '_'),
                         station2.replace('.', '_'),
                         int(filterid),
                         thisdate,
                         thistime,
                         params.min30 / params.goal_sampling_rate,
                         components,
                         corr,
                         params.goal_sampling_rate,
                         day=True,
                         ncorr=corrs.shape[0],
                         params=params)

        # THIS SHOULD BE IN THE API
        massive_update_job(db, jobs, "D")

        for job in jobs:
            update_job(db, job.day, job.pair, 'STACK', 'T')

        logging.info(
            "Job Finished. It took %.2f seconds (preprocess: %.2f s & "
            "process %.2f s)" % ((time.time() - jt), start_processing - jt,
                                 time.time() - start_processing))
        del stream
    logging.info('*** Finished: Compute CC ***')
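# --- Added sketch (illustrative, not from the code base above): the clipping
# --- step winsorises each trace at windsorizing * RMS, where the RMS is taken
# --- only over samples inside the 1st-99th percentile band.
import numpy as np
from scipy.stats import scoreatpercentile

def clip_trace(data, windsorizing=3.0):
    imin, imax = scoreatpercentile(data, [1, 99])
    core = data[(data >= imin) & (data <= imax)]
    rms = core.std() * windsorizing
    return np.clip(data, -rms, rms)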
Example 36
def do_iccf(source_name, source_ID, zspec_driving, jd_driving, flux_driving,
            errflux_driving, jd_responding, flux_responding,
            errflux_responding):
    #Calculate lag with python CCF program

    print "########## computing ICCF for source %d  ##########" % (source_ID)

    tlag_peak, status_peak, tlag_centroid, status_centroid, ccf_pack, max_rval, status_rval, pval = myccf.peakcent(
        jd_driving / (zspec_driving + 1.0), flux_driving,
        jd_responding / (zspec_driving + 1.0), flux_responding, lag_range[0],
        lag_range[1], interp)
    tlags_peak, tlags_centroid, nsuccess_peak, nfail_peak, nsuccess_centroid, nfail_centroid, max_rvals, nfail_rvals, pvals = myccf.xcor_mc(
        jd_driving / (zspec_driving + 1.0),
        flux_driving,
        abs(errflux_driving),
        jd_responding / (zspec_driving + 1.0),
        flux_responding,
        abs(errflux_responding),
        lag_range[0],
        lag_range[1],
        interp,
        nsim=nsim,
        mcmode=mcmode,
        sigmode=sigmode)

    lag = ccf_pack[1]
    r = ccf_pack[0]

    ### Calculate the best peak and centroid and their uncertainties using the
    ### median of the distributions.
    centau = stats.scoreatpercentile(tlags_centroid, 50)
    centau_uperr = (stats.scoreatpercentile(tlags_centroid, perclim)) - centau
    centau_loerr = centau - (stats.scoreatpercentile(tlags_centroid,
                                                     (100. - perclim)))
    print 'Centroid, error: %10.3f  (+%10.3f -%10.3f)' % (centau, centau_uperr,
                                                          centau_loerr)
    print "centroid org:", tlag_centroid
    peaktau = stats.scoreatpercentile(tlags_peak, 50)
    peaktau_uperr = (stats.scoreatpercentile(tlags_peak, perclim)) - peaktau
    peaktau_loerr = peaktau - (stats.scoreatpercentile(tlags_peak,
                                                       (100. - perclim)))
    print 'Peak, errors: %10.3f  (+%10.3f -%10.3f)' % (peaktau, peaktau_uperr,
                                                       peaktau_loerr)
    print "peak org:", tlag_peak

    #Write results out to a file in case we want them later.
    centfile = open(
        'iccf_stat/centdist_iccf_dt' + str(interp) + '_' + source_name +
        '.txt', 'w')
    peakfile = open(
        'iccf_stat/peakdist_iccf_dt' + str(interp) + '_' + source_name +
        '.txt', 'w')
    ccf_file = open(
        'iccf_stat/org_iccf_dt' + str(interp) + '_' + source_name + '.txt',
        'w')
    for m in xrange(0, np.size(tlags_centroid)):
        centfile.write('%5.5f    \n' % (tlags_centroid[m]))
    centfile.close()
    for m in xrange(0, np.size(tlags_peak)):
        peakfile.write('%5.5f    \n' % (tlags_peak[m]))
    peakfile.close()
    for m in xrange(0, np.size(lag)):
        ccf_file.write('%5.5f    %5.5f  \n' % (lag[m], r[m]))
    ccf_file.close()

    return (source_ID, zspec_driving, nsuccess_peak, tlag_peak, peaktau,
            peaktau_loerr, peaktau_uperr, nsuccess_centroid, tlag_centroid,
            centau, centau_loerr, centau_uperr)
Example 37
def pairwise_bootstrap_plot(theta_est,
                            theta_star,
                            alpha,
                            axis_limits=None,
                            filename=None):
    """
    Plot pairwise relationships for theta estimates along with confidence
    intervals using multivariate normal and kernel density estimate distributions
    
    Parameters
    ----------
    theta_est: `pandas DataFrame` (columns = variable names)
        Theta estimate (returned by parmest.bootstrap). If the DataFrame
        contains a column named 'samples', it will not be included in the plot.
    theta_star: `dict` or `pandas Series` (index = variable names)
        Theta*
    alpha: `float`
        Confidence interval
    axis_limits: `dict` or `pandas Series` (optional)
        Axis limits in the format {variable: [min, max]}
    filename: `string` (optional)
        Filename used to save the figure
    
    Returns
    --------
    Multivariate normal distribution (scipy.stats.multivariate_normal), 
    gaussian kde distribution (scipy.stats.gaussian_kde)
    """
    if 'samples' in theta_est.columns:
        theta_est = theta_est.drop('samples', axis=1)
    if isinstance(theta_star, dict):
        theta_star = pd.Series(theta_star)

    m = theta_est.mean()
    c = theta_est.cov()
    mvn_dist = stats.multivariate_normal(m, c, allow_singular=True)
    mvnZ = mvn_dist.pdf(theta_est)
    mvn_score = stats.scoreatpercentile(mvnZ.transpose(), (1 - alpha) * 100)

    kde_dist = stats.gaussian_kde(
        theta_est.transpose().values)  # data.shape = (#dim, #data)
    kdeZ = kde_dist.pdf(theta_est.transpose())
    kde_score = stats.scoreatpercentile(kdeZ.transpose(), (1 - alpha) * 100)

    columns = theta_est.columns
    ncells = 100

    g = sns.PairGrid(theta_est)
    g.map_diag(sns.distplot, kde=False, hist=True, norm_hist=False)
    #g.map_diag(sns.distplot, fit=stats.norm, hist=False,  fit_kws={'color': 'b'}) #, kde=False, norm_hist=False) # histogram and kde estimate
    #g.map_diag(sns.kdeplot) #, color='r')

    g.map_upper(_add_scatter, columns=columns, theta_star=theta_star)
    g.map_lower(_add_scatter, columns=columns, theta_star=theta_star)

    g.map_lower(_add_rectangle_CI, columns=columns, alpha=alpha)
    g.map_lower(_add_multivariate_normal_CI,
                columns=columns,
                ncells=ncells,
                alpha=mvn_score,
                mvn_dist=mvn_dist,
                theta_star=theta_star)
    g.map_lower(_add_gaussian_kde_CI,
                columns=columns,
                ncells=ncells,
                alpha=kde_score,
                kde_dist=kde_dist,
                theta_star=theta_star)

    if axis_limits is not None:
        for ax in g.fig.get_axes():
            xvar, yvar, (xloc, yloc) = _get_variables(ax, columns)
            if xloc != yloc:  # not on diagonal
                ax.set_ylim(axis_limits[yvar])
                ax.set_xlim(axis_limits[xvar])
    if filename is not None:
        plt.savefig(filename)

    return mvn_dist, kde_dist
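# --- Added sketch (not parmest code): the contour level used above is the
# --- (1 - alpha) percentile of the estimated density evaluated at the samples,
# --- so roughly a fraction alpha of the samples lies inside the contour.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
samples = rng.multivariate_normal([0, 0], np.eye(2), size=500)  # (n, 2)
kde = stats.gaussian_kde(samples.T)            # gaussian_kde expects (dim, n)
density_at_samples = kde.pdf(samples.T)
alpha = 0.95
level = stats.scoreatpercentile(density_at_samples, (1 - alpha) * 100)
# points with density >= level fall inside the approximate 95% region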
Example 38
data = pd.read_csv('data/Chapter9_Data.csv',
                   parse_dates=True,
                   index_col='date')

returns = data.apply(np.log) - data.apply(np.log).shift()
returns.dropna(inplace=True)
returns *= scale
returns.plot()

percentile = range(15, 86)

cor0 = pd.DataFrame(index=percentile, columns=['cor'])

for p in percentile:

    score_sp = stats.scoreatpercentile(returns['sp'], p)
    score_tn = stats.scoreatpercentile(returns['tn'], p)

    if p <= 50:
        cut = returns.loc[(returns['sp'] <= score_sp) &
                          (returns['tn'] <= score_tn), ]

        cor_num = stats.pearsonr(cut['sp'], cut['tn'])

        cor0.loc[p, 'cor'] = cor_num[0]
    else:
        cut = returns.loc[(returns['sp'] > score_sp) &
                          (returns['tn'] > score_tn), ]

        cor_num = stats.pearsonr(cut['sp'], cut['tn'])

        cor0.loc[p, 'cor'] = cor_num[0]
######################################################################
# Generate figures
# ----------------

with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)

    for index, (ic_map, ic_terms) in enumerate(
            zip(ica_maps, term_weights_for_components)):
        if -ic_map.min() > ic_map.max():
            # Flip the map's sign for prettiness
            ic_map = -ic_map
            ic_terms = -ic_terms

        ic_threshold = stats.scoreatpercentile(np.abs(ic_map), 90)
        ic_img = masker.inverse_transform(ic_map)
        important_terms = vocabulary[np.argsort(ic_terms)[-3:]]
        title = 'IC%i  %s' % (index, ', '.join(important_terms[::-1]))

        plotting.plot_stat_map(ic_img,
                               threshold=ic_threshold,
                               colorbar=False,
                               title=title)

######################################################################
# As we can see, some of the components capture cognitive or neurological
# maps, while others capture noise in the database. More data, better
# filtering, and better cognitive labels would give better maps.

# Done.
Example 40
    def conditionsPlot(self, results):
        # summarize results for each experimental condition  
        print('  Tabulating results for each experimental condition using marker sets.')
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        compOutliers = defaultdict(list)
        contOutliers = defaultdict(list)
        
        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            compDataDict[expCondStr]['best'] += results[simId][2]
            compDataDict[expCondStr]['domain'] += results[simId][6]
            compDataDict[expCondStr]['selected'] += results[simId][10]
            
            for dComp in results[simId][2]:
                compOutliers[expCondStr] += [[dComp, genomeId]]
            
            contDataDict[expCondStr]['best'] += results[simId][3]
            contDataDict[expCondStr]['domain'] += results[simId][7]
            contDataDict[expCondStr]['selected'] += results[simId][11]
            
            for dCont in results[simId][3]:
                contOutliers[expCondStr] += [[dCont, genomeId]]
                
        print('  There are %d unique genomes.' % len(genomeIds))
              
        sys.stdout.write('\n')
        
        print('    There are %d experimental conditions.' % (len(compDataDict)))
                
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        
        foutComp = open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w')
        foutCont = open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w')
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]: 
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                    
                # report completeness outliers
                foutComp.write(expCondStr)

                compOutliers[expCondStr].sort()
                
                dComps = array([r[0] for r in compOutliers[expCondStr]])
                perc1 = scoreatpercentile(dComps, 1)
                perc99 = scoreatpercentile(dComps, 99)
                print(expCondStr, perc1, perc99)
                
                foutComp.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in compOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutComp.write('\t' + genomeId + ': ' + str(count))
                foutComp.write('\n')
                
                # report contamination outliers
                foutCont.write(expCondStr)

                contOutliers[expCondStr].sort()
                
                dConts = array([r[0] for r in contOutliers[expCondStr]])
                perc1 = scoreatpercentile(dConts, 1)
                perc99 = scoreatpercentile(dConts, 99)
                
                foutCont.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in contOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutCont.write('\t' + genomeId + ': ' + str(count))
                foutCont.write('\n')
                
        foutComp.close()
        foutCont.close()
                               
        print('best:\t%.2f\t%.2f' % (mean(abs(array(compData[0::3]))), mean(abs(array(contData[0::3])))))
        print('selected:\t%.2f\t%.2f' % (mean(abs(array(compData[1::3]))), mean(abs(array(contData[1::3])))))   
        print('domain:\t%.2f\t%.2f' % (mean(abs(array(compData[2::3]))), mean(abs(array(contData[2::3])))))   

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.conditions.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
        
        # print table of results 
        tableOut = open(self.simCompareConditionOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                   
                    meanCompD = mean(abs(array(compDataDict[expCondStr]['domain'])))
                    stdCompD = std(abs(array(compDataDict[expCondStr]['domain'])))
                    meanContD = mean(abs(array(contDataDict[expCondStr]['domain'])))
                    stdContD = std(abs(array(contDataDict[expCondStr]['domain'])))
                    
                    avgComp[seqLen]['domain'] += compDataDict[expCondStr]['domain']
                    avgCont[seqLen]['domain'] += contDataDict[expCondStr]['domain']
                    
                    meanCompS = mean(abs(array(compDataDict[expCondStr]['selected'])))
                    stdCompS = std(abs(array(compDataDict[expCondStr]['selected'])))
                    meanContS = mean(abs(array(contDataDict[expCondStr]['selected'])))
                    stdContS = std(abs(array(contDataDict[expCondStr]['selected'])))
                    
                    avgComp[seqLen]['selected'] += compDataDict[expCondStr]['selected']
                    avgCont[seqLen]['selected'] += contDataDict[expCondStr]['selected']
                    
                    meanCompB = mean(abs(array(compDataDict[expCondStr]['best'])))
                    stdCompB = std(abs(array(compDataDict[expCondStr]['best'])))
                    meanContB = mean(abs(array(contDataDict[expCondStr]['best'])))
                    stdContB = std(abs(array(contDataDict[expCondStr]['best'])))
                    
                    avgComp[seqLen]['best'] += compDataDict[expCondStr]['best']
                    avgCont[seqLen]['best'] += contDataDict[expCondStr]['best']
                    
                    tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompD = mean(abs(array(avgComp[seqLen]['domain'])))
            stdCompD = std(abs(array(avgComp[seqLen]['domain'])))
            meanContD = mean(abs(array(avgCont[seqLen]['domain'])))
            stdContD = std(abs(array(avgCont[seqLen]['domain'])))
            
            meanCompS = mean(abs(array(avgComp[seqLen]['selected'])))
            stdCompS = std(abs(array(avgComp[seqLen]['selected'])))
            meanContS = mean(abs(array(avgCont[seqLen]['selected'])))
            stdContS = std(abs(array(avgCont[seqLen]['selected'])))
            
            meanCompB = mean(abs(array(avgComp[seqLen]['best'])))
            stdCompB = std(abs(array(avgComp[seqLen]['best'])))
            meanContB = mean(abs(array(avgCont[seqLen]['best'])))
            stdContB = std(abs(array(avgCont[seqLen]['best'])))
            
            tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                        
        tableOut.write('\n')     
                
        tableOut.close()
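# --- Added sketch (synthetic data, names illustrative): the outlier report
# --- above flags values outside the 1st-99th percentile band and counts how
# --- often each genome appears among them.
import numpy as np
from collections import Counter
from scipy.stats import scoreatpercentile

deltas = np.random.randn(200)
genomes = ['genome_%d' % (i % 20) for i in range(200)]
low, high = scoreatpercentile(deltas, 1), scoreatpercentile(deltas, 99)
outliers = [g for d, g in zip(deltas, genomes) if d < low or d > high]
print(Counter(outliers).most_common())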
Example 41
#! /usr/bin/python
#-*-coding:utf-8 -*-

import numpy as np
import scipy.stats as sta

data = np.random.random_sample(900)

# print data

Mean,std = sta.norm.fit(data)

print (Mean , std)

print (sta.skewtest(data))
print (sta.kurtosistest(data))
print (sta.normaltest(data))
print (sta.scoreatpercentile(data,50))
print (sta.scoreatpercentile(data,1))

import matplotlib.pyplot as plt

plt.hist(data)
plt.show()
Example 42
def _filter_ridge_lines(cwt,
                        ridge_lines,
                        window_size=None,
                        min_length=None,
                        min_snr=1,
                        noise_perc=10):
    """
    Filter ridge lines according to prescribed criteria. Intended
    to be used for finding relative maxima.

    Parameters
    ----------
    cwt : 2-D ndarray
        Continuous wavelet transform from which the `ridge_lines` were defined.
    ridge_lines : 1-D sequence
        Each element should contain 2 sequences, the rows and columns
        of the ridge line (respectively).
    window_size : int, optional
        Size of window to use to calculate noise floor.
        Default is ``cwt.shape[1] / 20``.
    min_length : int, optional
        Minimum length a ridge line needs to be acceptable.
        Default is ``cwt.shape[0] / 4``, i.e. one quarter of the number of widths.
    min_snr : float, optional
        Minimum SNR. Default 1. The signal is the value of
        the cwt matrix at the shortest length scale (``cwt[0, loc]``), the
        noise is the `noise_perc`th percentile of datapoints contained within a
        window of `window_size` around ``cwt[0, loc]``.
    noise_perc : float, optional
        When calculating the noise floor, percentile of data points
        examined below which to consider noise. Calculated using
        scipy.stats.scoreatpercentile.

    References
    ----------
    Bioinformatics (2006) 22 (17): 2059-2065. :doi:`10.1093/bioinformatics/btl355`
    http://bioinformatics.oxfordjournals.org/content/22/17/2059.long

    """
    num_points = cwt.shape[1]
    if min_length is None:
        min_length = np.ceil(cwt.shape[0] / 4)
    if window_size is None:
        window_size = np.ceil(num_points / 20)

    window_size = int(window_size)
    hf_window, odd = divmod(window_size, 2)

    # Filter based on SNR
    row_one = cwt[0, :]
    noises = np.zeros_like(row_one)
    for ind, val in enumerate(row_one):
        window_start = max(ind - hf_window, 0)
        window_end = min(ind + hf_window + odd, num_points)
        noises[ind] = scoreatpercentile(row_one[window_start:window_end],
                                        per=noise_perc)

    def filt_func(line):
        if len(line[0]) < min_length:
            return False
        snr = abs(cwt[line[0][0], line[1][0]] / noises[line[1][0]])
        if snr < min_snr:
            return False
        return True

    return list(filter(filt_func, ridge_lines))
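# --- Added sketch (illustrative helper, not SciPy code): the noise floor used
# --- above is a low percentile of the finest-scale CWT row, computed inside a
# --- sliding window around each point.
import numpy as np
from scipy.stats import scoreatpercentile

def windowed_noise_floor(row, window_size=20, noise_perc=10):
    half, odd = divmod(int(window_size), 2)
    n = len(row)
    return np.array([
        scoreatpercentile(row[max(i - half, 0):min(i + half + odd, n)],
                          noise_perc) for i in range(n)
    ])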
Example 43
        ".h5")  #fetch h5 file to allow faster preprocessing
    keys[idx], modes[idx] = ut.get_key_feature(track, h5)
    loudnesses[idx], loudnesses_var[idx], loudnesses_interval[
        idx] = ut.get_loudness(track, h5)
    tempos[idx] = ut.get_tempo_feature(track, h5)
    time_signatures[idx] = ut.get_time_signature(track, h5)
    timbre_means[idx], timbre_vars[idx], timbre_median[idx], timbre_min[
        idx], timbre_max[idx] = ut.get_timbre(track, h5)
    pitches_means[idx], pitches_vars[idx], pitches_median[idx], pitches_min[
        idx], pitches_max[idx] = ut.get_pitches(track, h5)
    energies[idx] = ut.get_energy_feature(track)
    h5.close()

#use binning for continuous data
#problem: choosing the number of bins => Freedman-Diaconis rule
num_bins = 2 * (stats.scoreatpercentile(loudnesses_interval, 75) -
                stats.scoreatpercentile(loudnesses_interval, 25)
                ) * len(loudnesses_interval)**(1 / 3)
bins = np.linspace(min(loudnesses_interval),
                   max(loudnesses_interval),
                   num=num_bins)
d_loudnesses_interval = np.digitize(loudnesses_interval, bins)

num_bins = 2 * (stats.scoreatpercentile(loudnesses, 75) -
                stats.scoreatpercentile(loudnesses, 25)) * len(loudnesses)**(
                    1 / 3)
bins = np.linspace(min(loudnesses), max(loudnesses), num=100)
d_loudnesses = np.digitize(loudnesses, bins)

num_bins = 2 * (stats.scoreatpercentile(tempos, 75) -
                stats.scoreatpercentile(tempos, 25)) * len(tempos)**(1 / 3)
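# --- Added sketch (not part of the snippet above): the Freedman-Diaconis rule
# --- actually prescribes a bin *width* of 2 * IQR * n**(-1/3); the number of
# --- bins then follows from the data range. The code above computes it
# --- differently, so this is only an illustration of the rule itself.
import numpy as np
from scipy import stats

def freedman_diaconis_bins(x):
    x = np.asarray(x, dtype=float)
    iqr = stats.scoreatpercentile(x, 75) - stats.scoreatpercentile(x, 25)
    width = 2.0 * iqr * len(x) ** (-1.0 / 3.0)
    if width == 0:
        return 10  # fall back for degenerate data
    return int(np.ceil((x.max() - x.min()) / width))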
Example 44
def redsker(b, idx, err=True):
    depth = 12
    h = es.htm.HTM(depth)
    ra = b.field('ra')
    dec = b.field('dec')
    photoz = b.field('z')
    central = b.field('central')
    gmr = b.field('omag')[:, 0] - b.field('omag')[:, 1]
    rmi = b.field('omag')[:, 1] - b.field('omag')[:, 2]
    imz = b.field('omag')[:, 2] - b.field('omag')[:, 3]
    gmz = b.field('omag')[:, 0] - b.field('omag')[:, 3]
    rmz = b.field('omag')[:, 1] - b.field('omag')[:, 3]
    gmi = b.field('omag')[:, 0] - b.field('omag')[:, 2]
    num = len(ra)
    if err:
        gmrerr = b.field('omagerr')[:, 0] - b.field('omagerr')[:, 1]
        rmierr = b.field('omagerr')[:, 1] - b.field('omagerr')[:, 2]
        imzerr = b.field('omagerr')[:, 2] - b.field('omagerr')[:, 3]
        gmzerr = b.field('omagerr')[:, 0] - b.field('omagerr')[:, 3]
        rmzerr = b.field('omagerr')[:, 1] - b.field('omagerr')[:, 3]
        gmierr = b.field('omagerr')[:, 0] - b.field('omagerr')[:, 2]
    else:
        gmrerr = np.zeros(num)
        rmierr = np.zeros(num)
        imzerr = np.zeros(num)
        gmzerr = np.zeros(num)
        rmzerr = np.zeros(num)
        gmierr = np.zeros(num)
    iamag = b.field('amag')[:, 2]
    imag = b.field('omag')[:, 2]
    srad = np.rad2deg(1. / es.cosmology.Da(0, photoz[idx], h=0.7) /
                      (1 + photoz[idx]))
    m1, m2, d12 = h.match(ra[idx], dec[idx], ra, dec, srad, maxmatch=5000)
    indices = (imag[m2] <= limi(photoz[idx])) * (imag[m2] > imag[m1])
    #indices=(iamag[m2]<=-20)*(iamag[m2]>iamag[m1])
    ntot = len(m2[indices])
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(gmr[m2[indices]], per=80),
        sts.scoreatpercentile(gmr[m2[indices]], per=30)
    ])
    sigma = np.array([0.04, 0.3])
    aic2 = gmm.aic_ecgmm(gmr[m2[indices]], gmrerr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(gmr[m2[indices]], gmrerr[m2[indices]])[3]
    fig = pl.figure(figsize=(15, 8))
    ax = fig.add_subplot(2, 3, 1)
    pl.hist(gmr[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('g - r')
    pl.title('M200: ' + str(b[idx].field('m200')))
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(rmi[m2[indices]], per=80),
        sts.scoreatpercentile(rmi[m2[indices]], per=30)
    ])
    sigma = np.array([0.04, 0.3])
    aic2 = gmm.aic_ecgmm(rmi[m2[indices]], rmierr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(rmi[m2[indices]], rmierr[m2[indices]])[3]
    ax = fig.add_subplot(2, 3, 2)
    pl.hist(rmi[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('r - i')
    pl.title('photoz: ' + str(photoz[idx]))
    pl.xlim(-0.2, 2.5)
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(imz[m2[indices]], per=60),
        sts.scoreatpercentile(imz[m2[indices]], per=30)
    ])
    sigma = np.array([0.02, 0.3])
    aic2 = gmm.aic_ecgmm(imz[m2[indices]], imzerr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(imz[m2[indices]], imzerr[m2[indices]])[3]
    ax = fig.add_subplot(2, 3, 3)
    pl.hist(imz[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('i - z')
    pl.title('Ntot: ' + str(ntot))
    pl.xlim(-0.2, 2.5)
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(gmz[m2[indices]], per=60),
        sts.scoreatpercentile(gmz[m2[indices]], per=30)
    ])
    sigma = np.array([0.02, 0.3])
    aic2 = gmm.aic_ecgmm(gmz[m2[indices]], gmzerr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(gmz[m2[indices]], gmzerr[m2[indices]])[3]
    ax = fig.add_subplot(2, 3, 4)
    pl.hist(gmz[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('g - z')
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(rmz[m2[indices]], per=60),
        sts.scoreatpercentile(rmz[m2[indices]], per=30)
    ])
    sigma = np.array([0.02, 0.3])
    aic2 = gmm.aic_ecgmm(rmz[m2[indices]], rmzerr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(rmz[m2[indices]], rmzerr[m2[indices]])[3]
    ax = fig.add_subplot(2, 3, 5)
    pl.hist(rmz[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('r - z')
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(gmi[m2[indices]], per=60),
        sts.scoreatpercentile(gmi[m2[indices]], per=30)
    ])
    sigma = np.array([0.02, 0.3])
    aic2 = gmm.aic_ecgmm(gmi[m2[indices]], gmierr[m2[indices]], alpha, mu,
                         sigma)
    aic1 = gmm.wstat(gmi[m2[indices]], gmierr[m2[indices]])[3]
    ax = fig.add_subplot(2, 3, 6)
    pl.hist(gmi[m2[indices]], bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    t = gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel('g - i')
    pl.text(0.1,
            0.85,
            r'$\alpha$: ' + str(np.round(alpha, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.8,
            r'$\mu$: ' + str(np.round(mu, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.75,
            r'$\sigma$: ' + str(np.round(sigma, 4)),
            transform=ax.transAxes)
    pl.text(0.1,
            0.68,
            r'$Ngals$: ' + str(np.round(ntot * alpha[0])),
            transform=ax.transAxes)
    pl.text(0.1,
            0.6,
            r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2)),
            transform=ax.transAxes)
    return ('Plot is done!')
Example 45
 def scoreatpercentile(cum_preds, p):
     return [stats.scoreatpercentile(c, p) for c in cum_preds.T]
def crop(f, a, b):
    from scipy.stats import scoreatpercentile
    s1 = scoreatpercentile(f, a)
    s2 = scoreatpercentile(f, 100 - b)
    assert s1 <= s2
    return np.logical_and(f >= s1, f <= s2)
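# --- Added usage note (illustrative) for crop() above: crop(f, a, b) keeps the
# --- values of f between its a-th and (100 - b)-th percentiles, e.g.
# ---     mask = crop(np.random.randn(1000), 5, 5)   # central 90% of values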
Example 47
def plot_best(trace=None,
              data_train=None,
              data_test=None,
              samples=1000,
              burn=200,
              axs=None):
    """Plot BEST significance analysis.

    Parameters
    ----------
    trace : pymc3.sampling.BaseTrace, optional
        trace object as returned by model_best()
        If not passed, will run model_best(), for which
        data_train and data_test are required.
    data_train : pandas.Series, optional
        Returns of in-sample period.
        Required if trace=None.
    data_test : pandas.Series, optional
        Returns of out-of-sample period.
        Required if trace=None.
    samples : int, optional
        Posterior samples to draw.
    burn : int
        Posterior sampels to discard as burn-in.
    axs : array of matplotlib.axes objects, optional
        Plot into passed axes objects. Needs at least 7 axes.

    Returns
    -------
    None

    See Also
    --------
    model_best : Estimation of BEST model.
    """
    if trace is None:
        if (data_train is None) or (data_test is None):
            raise ValueError('Either pass trace or data_train and data_test')
        trace = model_best(data_train, data_test, samples=samples)

    trace = trace[burn:]
    if axs is None:
        fig, axs = plt.subplots(ncols=2, nrows=4, figsize=(16, 4))
        axs = axs.flatten()

    def distplot_w_perc(trace, ax):
        sns.distplot(trace, ax=ax)
        ax.axvline(stats.scoreatpercentile(trace, 2.5),
                   color='0.5',
                   label='2.5 and 97.5 percentiles')
        ax.axvline(stats.scoreatpercentile(trace, 97.5), color='0.5')

    sns.distplot(trace['group1_mean'], ax=axs[0], label='backtest')
    sns.distplot(trace['group2_mean'], ax=axs[0], label='forward')
    axs[0].legend(loc=0)
    axs[1].legend(loc=0)

    distplot_w_perc(trace['difference of means'], axs[1])

    axs[0].set(xlabel='mean', ylabel='belief', yticklabels=[])
    axs[1].set(xlabel='difference of means', yticklabels=[])

    sns.distplot(trace['group1_annual_volatility'],
                 ax=axs[2],
                 label='backtest')
    sns.distplot(trace['group2_annual_volatility'], ax=axs[2], label='forward')
    distplot_w_perc(
        trace['group2_annual_volatility'] - trace['group1_annual_volatility'],
        axs[3])
    axs[2].set(xlabel='Annual volatility', ylabel='belief', yticklabels=[])
    axs[2].legend(loc=0)
    axs[3].set(xlabel='difference of volatility', yticklabels=[])

    sns.distplot(trace['group1_sharpe'], ax=axs[4], label='backtest')
    sns.distplot(trace['group2_sharpe'], ax=axs[4], label='forward')
    distplot_w_perc(trace['group2_sharpe'] - trace['group1_sharpe'], axs[5])
    axs[4].set(xlabel='Sharpe', ylabel='belief', yticklabels=[])
    axs[4].legend(loc=0)
    axs[5].set(xlabel='difference of Sharpes', yticklabels=[])

    sns.distplot(trace['effect size'], ax=axs[6])
    axs[6].axvline(stats.scoreatpercentile(trace['effect size'], 2.5),
                   color='0.5')
    axs[6].axvline(stats.scoreatpercentile(trace['effect size'], 97.5),
                   color='0.5')
    axs[6].set(xlabel='difference of means normalized by volatility',
               ylabel='belief',
               yticklabels=[])
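# --- Added sketch (not pyfolio code): the 2.5th/97.5th percentile lines drawn
# --- above delimit a central 95% interval of the posterior sample.
from scipy import stats

def central_interval(sample, level=95.0):
    tail = (100.0 - level) / 2.0
    return (stats.scoreatpercentile(sample, tail),
            stats.scoreatpercentile(sample, 100.0 - tail))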
Example 48
    def plot_tags_per_basepair(self,
                               data,
                               labels,
                               title='',
                               xlabel='',
                               ylabel='',
                               window_len=100,
                               ymax_percentile=99.5,
                               tag_scalars=None,
                               show_moving_average=True,
                               show_count=False):
        '''
        Given a list of data frames with cols basepair and tag_count, 
        graph each as a line.
        
        '''

        fig = pyplot.figure(figsize=[12, 6])
        # Set up plot
        ax = pyplot.subplot(111)
        ax.set_xlim([self.from_bp, self.to_bp])

        all_y_vals = []
        colors = self.get_colors(len(data))
        if show_moving_average:
            for i, dataset in enumerate(data):
                try:
                    dataset[
                        'tag_count'] = dataset['tag_count'] * tag_scalars[i]
                except TypeError:
                    dataset['tag_count'] = dataset['tag_count'] * (tag_scalars
                                                                   or 1)

                all_y_vals.extend(dataset['tag_count'])
                pyplot.plot(dataset['basepair'],
                            dataset['tag_count'],
                            '.',
                            markeredgecolor=colors[i],
                            markerfacecolor='None',
                            alpha=.2,
                            markeredgewidth=.5)

        # Another loop, since we want all the lines above all the circles
        for i, dataset in enumerate(data):
            # Graph fit line
            line_type = i % 2 and '--' or '-'
            if show_moving_average:
                x, y = self.smooth(dataset['basepair'],
                                   dataset['tag_count'],
                                   window_len=window_len)
            else:
                x, y = dataset['basepair'], dataset['tag_count']
            pyplot.plot(x,
                        y,
                        line_type,
                        color=colors[i],
                        label=labels[i],
                        linewidth=2)
            if show_count:
                pyplot.text(.05,
                            .9 - i * .05,
                            'Tag count: {0}'.format(sum(dataset['tag_count'])),
                            color=colors[i],
                            transform=ax.transAxes)

        # Limit yaxis by percentile if desired:
        if show_moving_average and ymax_percentile:
            ymax = stats.scoreatpercentile(all_y_vals, ymax_percentile)
            ax.set_ylim([0, int(math.ceil(ymax))])

        pyplot.legend()
        self.add_title(title or 'Tag counts around transcription start sites',
                       ax)
        self.add_axis_labels(xlabel or 'Basepairs from TSS', ylabel
                             or 'Normalized number of tag starts')

        return ax
Example 49
def iqr(a):
    """Calculate the IQR for an array of numbers."""
    a = np.asarray(a)
    q1 = stats.scoreatpercentile(a, 25)
    q3 = stats.scoreatpercentile(a, 75)
    return q3 - q1
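# --- Added usage note (illustrative) for iqr() above: with scoreatpercentile's
# --- default 'fraction' interpolation,
# ---     iqr([1, 2, 3, 4, 5, 6, 7, 8, 9])  ->  4.0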
Example 50
 def distplot_w_perc(trace, ax):
     sns.distplot(trace, ax=ax)
     ax.axvline(stats.scoreatpercentile(trace, 2.5),
                color='0.5',
                label='2.5 and 97.5 percentiles')
     ax.axvline(stats.scoreatpercentile(trace, 97.5), color='0.5')
Example 51
    clf.fit(X_train)

    # predict raw anomaly score
    scores_pred = clf.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X_train)

    # no of errors in prediction
    n_errors = (y_pred != Y_train).sum()
    print('No of Errors : ',clf_name, n_errors)

    # rest of the code is to create the visualization

    # threshold value to consider a datapoint inlier or outlier
    threshold = stats.scoreatpercentile(scores_pred, 100 * outlier_fraction)

    # decision function calculates the raw anomaly score for every point
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    subplot = plt.subplot(1, 2, i + 1)

    # fill blue colormap from minimum anomaly score to threshold value
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10),
                     cmap=plt.cm.Blues_r)

    # draw a red contour line where the anomaly score equals the threshold
    a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')

    # fill with orange where the anomaly score runs from the threshold to its maximum
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
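# --- Added sketch (synthetic scores, illustrative convention): choosing a
# --- threshold so that a fixed fraction of points falls below it, as in the
# --- percentile call above.
import numpy as np
from scipy import stats

rng = np.random.RandomState(42)
anomaly_scores = rng.randn(500)          # stand-in for scores_pred
outlier_fraction = 0.1
threshold = stats.scoreatpercentile(anomaly_scores, 100 * outlier_fraction)
n_flagged = int(np.sum(anomaly_scores < threshold))   # roughly 10% of points
print(threshold, n_flagged)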
Example 52
def high_variance_confounds(series,
                            n_confounds=5,
                            percentile=2.,
                            detrend=True):
    """ Return confounds time series extracted from series with highest
        variance.

        Parameters
        ==========
        series: numpy.ndarray
            Timeseries. A timeseries is a column in the "series" array.
            shape (sample number, feature number)

        n_confounds: int, optional
            Number of confounds to return

        percentile: float, optional
            Highest-variance series percentile to keep before computing the
            singular value decomposition, 0. <= `percentile` <= 100.
            series.shape[0] * percentile / 100 must be greater than n_confounds

        detrend: bool, optional
            If True, detrend timeseries before processing.

        Returns
        =======
        v: numpy.ndarray
            highest variance confounds. Shape: (samples, n_confounds)

        Notes
        ======
        This method is related to what has been published in the literature
        as 'CompCor' (Behzadi NeuroImage 2007).

        The implemented algorithm does the following:

        - compute sum of squares for each time series (no mean removal)
        - keep a given percentile of series with highest variances (percentile)
        - compute an svd of the extracted series
        - return a given number (n_confounds) of series from the svd with
          highest singular values.

        See also
        ========
        nilearn.image.high_variance_confounds
    """

    if detrend:
        series = _detrend(series)  # copy

    # Retrieve the voxels|features with highest variance

    # Compute variance without mean removal.
    var = _mean_of_squares(series)

    var_thr = stats.scoreatpercentile(var, 100. - percentile)
    series = series[:, var > var_thr]  # extract columns (i.e. features)
    # Return the singular vectors with largest singular values
    # We solve the symmetric eigenvalue problem here, increasing stability
    s, u = linalg.eigh(series.dot(series.T) / series.shape[0])
    ix_ = np.argsort(s)[::-1]
    u = u[:, ix_[:n_confounds]].copy()
    return u
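# --- Added standalone sketch of the core of high_variance_confounds above:
# --- keep the features whose variance exceeds a percentile threshold, then
# --- take the leading left singular vectors. Data are random, shapes assumed.
import numpy as np
from scipy import stats, linalg

rng = np.random.RandomState(0)
series_demo = rng.randn(200, 1000)                   # (samples, features)
var = np.mean(series_demo ** 2, axis=0)              # variance, no mean removal
var_thr = stats.scoreatpercentile(var, 100. - 2.)    # keep the top 2% of features
kept = series_demo[:, var > var_thr]
u, _, _ = linalg.svd(kept, full_matrices=False)
confounds = u[:, :5]                                 # (200, 5) confound series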
Example 53
def high_variance_confounds(series,
                            n_confounds=10,
                            percentile=1.,
                            detrend=True):
    """ Return confounds time series extracted from series with highest
        variance.

        Parameters
        ==========
        series: numpy.ndarray
            Timeseries. A timeseries is a column in the "series" array.
            shape (sample number, feature number)

        n_confounds: int
            Number of confounds to return

        percentile: float
            Highest-variance series percentile to keep before computing the
            singular value decomposition.
            series.shape[0] * percentile must be greater than n_confounds.

        detrend: bool
            If True, detrend timeseries before processing.

        Returns
        =======
        v: numpy.ndarray
            highest variance confounds. Shape: (samples, n_confounds)

        Notes
        ======
        This method is related to what has been published in the literature
        as 'CompCor' (Behzadi NeuroImage 2007).

        The implemented algorithm does the following:
        - compute sum of squares for each time series (no mean removal)
        - keep a given percentile of series with highest variances (percentile)
        - compute an svd of the extracted series
        - return a given number (n_confounds) of series from the svd with
          highest singular values.

        See also
        ========
        nisl.image.high_variance_confounds
    """

    # FIXME: when detrend=True, two copies of "series" are made.  Variance
    # computation below can be made chunk-by-chunk, which uses almost no
    # extra memory, and is as fast (if not faster).
    if detrend:
        series = _detrend(series)  # copy

    # Retrieve the voxels|features with highest variance

    # Compute variance without mean removal.
    var = np.copy(series)
    var **= 2
    var = var.mean(axis=0)

    var_thr = stats.scoreatpercentile(var, 100. - percentile)
    series = series[:, var > var_thr]  # extract columns (i.e. features)
    # Return the singular vectors with largest singular values
    u, _, _ = linalg.svd(series, full_matrices=False)
    u = u[:, :n_confounds].copy()
    return u
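
A hypothetical call with synthetic data, just to show the expected shapes (it assumes the function above and its numpy/scipy imports are in scope; detrend=False sidesteps the _detrend helper):

import numpy as np

rng = np.random.RandomState(42)
series = rng.randn(200, 500)                 # 200 time points, 500 voxels/features
confounds = high_variance_confounds(series, n_confounds=5,
                                    percentile=2., detrend=False)
print(confounds.shape)                       # (200, 5): one confound per column
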
Example 54
def ecgmmRidge(ra_c=None,
               dec_c=None,
               photoz_c=None,
               r200_c=None,
               m200_c=None,
               ra=None,
               dec=None,
               color=None,
               colorErr=None,
               mag=None,
               candidateIdx=None):
    #--define some quantity to be returned ----
    rac = ra_c[candidateIdx]
    decc = dec_c[candidateIdx]
    photozc = photoz_c[candidateIdx]
    r200c = r200_c[candidateIdx]
    m200c = m200_c[candidateIdx]
    ok = (color >= -1) * (color <= 5.)
    color = color[ok]
    colorErr = colorErr[ok]
    #colorErr = np.zeros(len(color))
    mag = mag[ok]
    ra = ra[ok]
    dec = dec[ok]
    BCGalpha0 = []
    BCGalpha1 = []
    BCGmu0 = []
    BCGmu1 = []
    BCGsigma0 = []
    BCGsigma1 = []
    BCGntot = []
    BCGamp = []
    BCGaic1 = []
    BCGaic2 = []
    BCGphotoz = []
    BCGm200c = []
    #-----------------------------------------
    Ncandidates = len(photozc)
    ridgeZ = np.zeros(Ncandidates)
    depth = 10
    h = es.htm.HTM(depth)
    Cosmo = es.cosmology.Cosmo(h=0.7)
    DA = Cosmo.Da(0, photozc)
    srad = np.rad2deg(r200c / DA)
    m1, m2, d12 = h.match(rac, decc, ra, dec, srad, maxmatch=5000)
    r12 = np.deg2rad(d12) * DA[m1]
    indices = (mag[m2] <= limz(photozc[m1]))  # no bcg assumed
    m1 = m1[indices]
    m2 = m2[indices]
    h, rev = es.stat.histogram(m1, binsize=1, rev=True)
    startTime = time.time()
    for i in range(h.size):
        if rev[i] != rev[i + 1]:
            print i
            indx = rev[rev[i]:rev[i + 1]]
            alpha = np.array([0.5, 0.5])
            mu = np.array([
                sts.scoreatpercentile(color[m2[indx]], per=70),
                sts.scoreatpercentile(color[m2[indx]], per=40)
            ])
            sigma = np.array([0.04, 0.3])
            aic2 = gmm.aic_ecgmm(color[m2[indx]], colorErr[m2[indx]], alpha,
                                 mu, sigma)
            aic1 = gmm.wstat(color[m2[indx]], colorErr[m2[indx]])[2]
            if aic2 < aic1:
                srt = np.argsort(sigma)
                BCGalpha0.append(alpha[srt[0]])
                BCGalpha1.append(alpha[srt[1]])
                BCGmu0.append(mu[srt[0]])
                BCGmu1.append(mu[srt[1]])
                BCGsigma0.append(sigma[srt[0]])
                BCGsigma1.append(sigma[srt[1]])
                BCGaic1.append(aic1)
                BCGaic2.append(aic2)
                BCGamp.append(len(indx) * alpha[srt[0]])
                BCGphotoz.append(photozc[m1[indx[0]]])
                BCGm200c.append(m200c[m1[indx[0]]])
                print aic2, aic1
    endTime = time.time()
    elapseTime = endTime - startTime
    print '---elapsed time: ' + str(elapseTime)
    return np.array(BCGalpha0), np.array(BCGalpha1), np.array(
        BCGmu0), np.array(BCGmu1), np.array(BCGsigma0), np.array(
            BCGsigma1), np.array(BCGaic1), np.array(BCGaic2), np.array(
                BCGamp), np.array(BCGphotoz), np.array(BCGm200c)
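
A minimal sketch of the initialization strategy used above: the two mixture means are seeded at the 70th and 40th percentiles of the colour distribution, one for the tight red ridge and one for the broader background (the data and component parameters here are made up; the actual error-corrected GMM fit from the gmm module is not reproduced):

import numpy as np
from scipy import stats

rng = np.random.RandomState(1)
color = np.concatenate([rng.normal(1.8, 0.05, 200),   # tight "ridge" component
                        rng.normal(1.2, 0.30, 300)])  # broad background component

alpha0 = np.array([0.5, 0.5])                          # equal starting weights
mu0 = np.array([stats.scoreatpercentile(color, 70),    # seed for the redder, tighter component
                stats.scoreatpercentile(color, 40)])   # seed for the bluer, broader component
sigma0 = np.array([0.04, 0.3])
print(mu0)                                             # starting means passed to the GMM fit
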
Example 55
 def test_2D(self):
     x = array([[1, 1, 1], [1, 1, 1], [4, 4, 3], [1, 1, 1], [1, 1, 1]])
     assert_array_equal(stats.scoreatpercentile(x, 50), [1, 1, 1])
    def learn_view(self,
                   X_view,
                   words_view,
                   joint_from_view_index,
                   C=1.0,
                   aperture=0.90,
                   aperture_type='probability',
                   update_joint=True,
                   sample_weight=1):

        initial_test_filter = np.empty(shape=(len(words_view), ), dtype=bool)

        for word_id in xrange(len(words_view)):
            y = self.data["y_lookup_init"][joint_from_view_index[word_id]]
            initial_test_filter[word_id] = (y != -1)

        # extend all the variables appropriately

        X_view_w = X_view
        words_view_w = words_view
        joint_from_view_index_w = joint_from_view_index

        # print "X VIEW"
        # print X_view.get_shape()
        # print X_view
        # print "X VIEW WINDOW - pre"
        # print X_view_w.get_shape()
        # print X_view_w

        initial_test_filter = initial_test_filter.nonzero()[0]

        # print "initial test filter"
        # print initial_test_filter
        # print len(initial_test_filter)

        for i in range(sample_weight - 1):
            X_view_w = vstack((X_view_w, X_view[initial_test_filter]),
                              format="csr")
            words_view_w = np.concatenate(
                (words_view_w, words_view[initial_test_filter]))
            joint_from_view_index_w = np.concatenate(
                (joint_from_view_index_w,
                 joint_from_view_index[initial_test_filter]))

        # print "X VIEW WINDOW - post"
        # print X_view_w.get_shape()
        # print X_view_w

        pred_view_w = np.empty(
            shape=(len(words_view_w), ),
            dtype=int)  # make a new empty vector for predicted values
        # (pred_view is predicted population sizes; not true/false)

        # print self.pred_joint

        # create answer vectors with the seed answers
        for word_id in xrange(len(pred_view_w)):
            pred_view_w[word_id] = self.pred_joint[
                joint_from_view_index_w[word_id]]

        y_view_w = (pred_view_w
                    == words_view_w) * 2 - 1  # set Trues to 1 and Falses to -1

        # set filter vectors (-1 = unknown)

        filter_train = (pred_view_w != -1).nonzero()[0]
        filter_test = (pred_view_w == -1).nonzero()[0]

        # print filter_train, len(filter_train)
        # print filter_train, len(filter_test)

        # self.metrics["cochrane_training_examples"].append(len(filter_train))
        # self.metrics["cochrane_test_examples"].append(len(filter_test))

        if len(filter_test) == 0:
            print "leaving early - run out of data!"
            raise IndexError("out of data")

        # set training vectors
        X_train = X_view_w[filter_train]
        y_train = y_view_w[filter_train]

        # and test vectors as the rest
        X_test = X_view_w[filter_test]
        y_test = y_view_w[filter_test]

        # and the numbers to go with it for illustration purposes
        words_test = words_view_w[filter_test]
        joint_from_view_index_test = joint_from_view_index_w[filter_test]

        # make and fit new LR model
        # model = LogisticRegression(C=C, penalty='l1')
        model = self.model(C=C)
        logging.debug("fitting model to cochrane data...")
        model.fit(X_train, y_train)

        if update_joint:

            preds = model.predict_proba(X_test)[:, 1]  # predict unknowns

            # get top results (by aperture type selected)
            if aperture_type == "percentile":
                top_pc_score = stats.scoreatpercentile(preds, aperture)
                top_result_indices = (preds > top_pc_score).nonzero()[0]
            elif aperture_type == "absolute":
                top_result_indices = np.argsort(preds)[-aperture:]
            else:
                top_pc_score = aperture
                top_result_indices = (preds > top_pc_score).nonzero()[0]

            # extend the joint predictions
            for i in top_result_indices:
                self.pred_joint[joint_from_view_index_test[i]] = words_test[i]

        return model
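
For reference, a small standalone sketch of the three aperture modes used above to decide which predictions get promoted into the joint labelling (the prediction values are made up, purely to show the selection logic):

import numpy as np
from scipy import stats

preds = np.array([0.05, 0.40, 0.92, 0.73, 0.99, 0.31])

# "percentile": keep predictions above the aperture-th percentile of the scores
aperture = 50
top_percentile = (preds > stats.scoreatpercentile(preds, aperture)).nonzero()[0]

# "absolute": keep the aperture highest-scoring predictions
aperture_n = 2
top_absolute = np.argsort(preds)[-aperture_n:]

# default ("probability"): keep predictions above a fixed probability cut
top_probability = (preds > 0.90).nonzero()[0]
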
Example 57
    def run_spherical_gNFW(self,
                           par,
                           plot=False,
                           save=True,
                           path='./',
                           fname='single_rst',
                           vmap='map',
                           markersize=0.5,
                           rDot=0.24):
        print('--------------------------------------------------')
        print('Run spherical gNFW model with given parameters')
        model['lnprob'] = lnprob_spherical_gNFW
        model['type'] = 'spherical_gNFW'
        model['ndim'] = 6
        model['JAMpars'] = ['cosinc', 'beta', 'ml', 'logrho_s', 'rs', 'gamma']
        # initialize the JAM class and pass to the global parameter
        model['JAM'] = \
            pyjam.axi_rms.jam(model['lum2d'], model['pot2d'],
                              model['distance'],
                              model['xbin'], model['ybin'], mbh=model['bh'],
                              quiet=True, sigmapsf=model['sigmapsf'],
                              pixsize=model['pixsize'], nrad=model['nrad'],
                              shape=model['shape'])
        rmsModel = lnprob_spherical_gNFW(par, True, False)
        xbin = model['xbin']
        ybin = model['ybin']
        rms = self.rms
        errRms = self.errRms
        goodbins = self.goodbins
        chi2 = np.sum(
            ((rms[goodbins] - rmsModel[goodbins]) / errRms[goodbins])**2)
        chi2_dof = chi2 / goodbins.sum()
        for i in range(len(par)):
            print('{}: {:.4f}'.format(model['JAMpars'][i], par[i]))
        print('chi2: {:.4f}'.format(chi2))
        print('chi2/dof: {:.4f}'.format(chi2_dof))
        print('--------------------------------------------------')

        rst = {
            'xbin': xbin,
            'ybin': ybin,
            'rms': rms,
            'errRms': errRms,
            'goodbins': goodbins,
            'rmsModel': rmsModel,
            'chi2': chi2,
            'chi2_dof': chi2_dof,
            'pars': par
        }
        if save:
            with open('{}/{}.dat'.format(path, fname), 'wb') as f:
                pickle.dump(rst, f)
        if plot:
            fig = plt.figure(figsize=(18 / 1.5, 5. / 1.5))
            axes0a = fig.add_subplot(131)
            axes0b = fig.add_subplot(132)
            axes0c = fig.add_subplot(133)
            fig.subplots_adjust(left=0.05,
                                bottom=0.1,
                                right=0.92,
                                top=0.99,
                                wspace=0.4)
            vmin, vmax = stats.scoreatpercentile(rms[goodbins], [0.5, 99.5])
            norm = colors.Normalize(vmin=vmin, vmax=vmax)
            velocity_plot(xbin,
                          ybin,
                          rms,
                          ax=axes0b,
                          text='$\mathbf{V_{rms}: Obs}$',
                          size=rDot,
                          norm=norm,
                          vmap=vmap,
                          markersize=markersize)
            velocity_plot(xbin,
                          ybin,
                          rmsModel,
                          ax=axes0a,
                          text='$\mathbf{V_{rms}: JAM}$',
                          size=rDot,
                          norm=norm,
                          bar=False,
                          vmap=vmap,
                          markersize=markersize)
            residualValue = rmsModel - rms
            vmax = \
                stats.scoreatpercentile(abs(residualValue[goodbins])
                                        .clip(-100, 100.), 99.5)
            norm_residual = colors.Normalize(vmin=-vmax, vmax=vmax)
            velocity_plot(xbin,
                          ybin,
                          residualValue,
                          ax=axes0c,
                          text='$\mathbf{Residual}$',
                          size=rDot,
                          norm=norm_residual,
                          vmap=vmap,
                          markersize=markersize)
            fig.savefig('{}/{}.png'.format(path, fname), dpi=300)
        return rst
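
A short sketch of the robust colour-scale choice above: the normalisation is clipped to the 0.5th-99.5th percentile range so that a handful of extreme bins do not wash out the map (the values below are synthetic, just to show the pattern):

import numpy as np
from scipy import stats
from matplotlib import colors

rng = np.random.RandomState(0)
rms = rng.normal(150., 20., 500)
rms[:3] = [5., 900., 1200.]                        # a few outlier bins

vmin, vmax = stats.scoreatpercentile(rms, [0.5, 99.5])
norm = colors.Normalize(vmin=vmin, vmax=vmax)       # shared by the data and model panels
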
Example 58
# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(3, 10, 500)
yr = np.linspace(-5, 45, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
threshold = stats.scoreatpercentile(scores, 100 * contamination)
plt.contourf(xx,
             yy,
             zz,
             levels=np.linspace(zz.min(), threshold, 7),
             cmap=plt.cm.Blues_r)  # Outlier
plt.contour(xx,
            yy,
            zz,
            levels=np.array([threshold]),
            linewidths=2,
            colors="red")  # The frontier
plt.contourf(xx,
             yy,
             zz,
             levels=np.linspace(threshold, zz.max(), 7))  # Inlier region
Example 59
 def fit(self, X, y):
     self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0)
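
Out of context this two-line fit method is easy to misread; below is a self-contained sketch of the same idea, a baseline that memorises the alpha-quantile of the training targets and predicts it everywhere (the class name and predict method are illustrative, not from the original source):

import numpy as np
from scipy import stats

class QuantileBaseline(object):
    """Hypothetical baseline: always predict the alpha-quantile of the training targets."""
    def __init__(self, alpha=0.9):
        self.alpha = alpha

    def fit(self, X, y):
        self.quantile = stats.scoreatpercentile(y, self.alpha * 100.0)
        return self

    def predict(self, X):
        return np.full(np.asarray(X).shape[0], self.quantile)

y = np.arange(100, dtype=float)
print(QuantileBaseline(alpha=0.9).fit(None, y).predict(np.zeros((3, 1))))
# -> [89.1 89.1 89.1], the interpolated 90th percentile of y
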
Example 60
 def test_percentile(self):
     x = arange(8) * 0.5
     assert_equal(stats.scoreatpercentile(x, 0), 0.)
     assert_equal(stats.scoreatpercentile(x, 100), 3.5)
     assert_equal(stats.scoreatpercentile(x, 50), 1.75)
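
Two related behaviours worth knowing when reading the examples above, assuming a SciPy version whose scoreatpercentile accepts a sequence of percentiles (the JAM plotting example already relies on this):

import numpy as np
from scipy import stats

x = np.arange(8) * 0.5
print(stats.scoreatpercentile(x, [25, 50, 75]))   # [0.875 1.75  2.625]
print(np.percentile(x, 50))                        # 1.75, same linear interpolation as the default 'fraction' method
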