Example 1
def ks_metric(y_true, y_scores, bins, path):
    '''
    :param y_true: array-like of shape = (n_samples) or (n_samples, n_outputs)
    Ground truth (correct) target values.
    :param y_scores: array-like of shape = (n_samples) or (n_samples, n_outputs)
    Estimated target values.
    :param bins: bins of y_scores
    :param path: if path equals 0, show the KS plot; if path is a filepath string, the KS
    plot will be saved to that path.
    :return: ks value
    '''
    df = pd.DataFrame({'y': y_true,
                       'score': y_scores})
    cdf_data1 = df[df['y'] == 0]['score']
    cdf_data2 = df[df['y'] == 1]['score']
    cdf1 = stats.cumfreq(cdf_data1, numbins=bins)
    cdf2 = stats.cumfreq(cdf_data2, numbins=bins)
    y_0 = cdf1[0] / cdf1[0][-1]
    y_1 = cdf2[0] / cdf2[0][-1]
    cdf_data = pd.DataFrame({'y_0': y_0, 'y_1': y_1})

    # plot
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(cdf_data)
    ax.legend(list(cdf_data.columns))
    plt.ylabel('Cumulative probability')
    plt.xlabel('Predicted score')
    if path == 0:
        plt.show()
    else:
        plt.savefig(path, dpi=150)

    # KS value: the maximum gap between the two class-conditional CDFs
    ks = np.max(np.abs(y_0 - y_1))
    return ks
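A minimal usage sketch for ks_metric; the synthetic labels/scores and the imports below are illustrative assumptions, not part of the original example:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)               # binary ground truth
y_scores = rng.beta(2, 5, size=1000) + 0.3 * y_true  # class-shifted scores
print('KS =', ks_metric(y_true, y_scores, bins=50, path=0))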
Example 2
def plotCDF(forecast, validation, title):
    ax1 = plt.figure(figsize=(7, 5))
    vals, x1, x2, x3 = cumfreq(forecast['modelled'], len(forecast['modelled']))
    ax1 = plt.plot(np.linspace(np.min(forecast['modelled']),
                               np.max(forecast['modelled']),
                               len(forecast['modelled'])),
                   vals / len(forecast['modelled']),
                   "r",
                   label=str(config.get('Main options', 'RunName')))
    vals, x1, x2, x3 = cumfreq(validation['modelled'],
                               len(validation['modelled']))
    ax2 = plt.plot(np.linspace(np.min(validation['modelled']),
                               np.max(validation['modelled']),
                               len(validation['modelled'])),
                   vals / len(validation['modelled']),
                   "b",
                   label=str(config.get('Reference options', 'RunName')))
    vals, x1, x2, x3 = cumfreq(validation['observations'],
                               len(validation['observations']))
    ax3 = plt.plot(np.linspace(np.min(validation['observations']),
                               np.max(validation['observations']),
                               len(validation['observations'])),
                   vals / len(validation['observations']),
                   "black",
                   label="Observations")
    ax3 = plt.legend(prop={'size': 10}, loc=2)
    ax1 = plt.title(title)
    ax1 = plt.xlabel("Discharge (m3/s)")
    ax1 = plt.ylabel("ECDF")
    ax1 = plt.gcf().set_tight_layout(True)
    pdf.savefig()
    plt.clf()
Example 3
def HM_color_transfer(style, content):
    """
        Color transfer the content image to the style image using cumulative
        distribution of both images,
        Args:
            style: target style in RGB space.
            content: content image in RGB space.

        Returns:
            Color transfer image in RGB space.
    """
    # Copy the style and content images, then convert them from the 0:1 to the 0:255 scale.
    transfered = np.copy(content)
    style = np.copy(style)
    transfered *= 255
    style *= 255

    # Calculate the normalized cumulative histogram of each channel, then
    # update the content image based on the calculated values.
    for h in range(3):
        content_c = transfered[:, :, h]
        style_c = style[:, :, h]
        height, width = content_c.shape
        contentValues, _, _, _ = stats.cumfreq(content_c, numbins=256)
        contentValues /= contentValues[-1]

        styleValues, _, _, _ = stats.cumfreq(style_c, numbins=256)
        styleValues /= styleValues[-1]

        K = 256
        new_values = np.zeros(K)

        # For each content level, find the style level with a matching CDF value.
        for a in np.arange(K):
            j = K - 1
            while True:
                new_values[a] = j
                j = j - 1
                if j < 0 or contentValues[a] > styleValues[j]:
                    break

        # ndarray.item/itemset were removed in NumPy 2.0; plain indexing is equivalent.
        for i in np.arange(height):
            for j in np.arange(width):
                a = content_c[i, j]
                transfered[i, j, h] = new_values[int(a)]
        # transfered[:,:,h] = gaussian(transfered[:,:,h])
    # return the image to the 0:1 scale
    transfered = transfered / 255
    return transfered
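A hedged usage sketch for HM_color_transfer; the random arrays stand in for real RGB images scaled to [0, 1], and the imports match the bare names the function body uses:

import numpy as np
from scipy import stats  # the function body calls stats.cumfreq

rng = np.random.default_rng(0)
content = rng.random((64, 64, 3))      # stand-in content image in [0, 1]
style = rng.random((64, 64, 3)) ** 2   # darker stand-in style palette
matched = HM_color_transfer(style, content)
print(matched.shape, float(matched.min()), float(matched.max()))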
Example 4
def compare_cdfs(data, A, num_bins=100):
    cdfs = {}
    assert len(np.unique(A)) == 2

    limits = (min(data), max(data))
    s = 0.5 * (limits[1] - limits[0]) / (num_bins - 1)
    limits = (limits[0] - s, limits[1] + s)

    for a in np.unique(A):
        subset = data[A == a]

        cdfs[a] = cumfreq(subset, numbins=num_bins, defaultreallimits=limits)

    lower_limits = [v.lowerlimit for _, v in cdfs.items()]
    bin_sizes = [v.binsize for _, v in cdfs.items()]
    actual_num_bins = [v.cumcount.size for _, v in cdfs.items()]

    assert len(np.unique(lower_limits)) == 1
    assert len(np.unique(bin_sizes)) == 1
    assert np.all([num_bins == v.cumcount.size for _, v in cdfs.items()])

    xs = lower_limits[0] + np.linspace(0, bin_sizes[0] * num_bins, num_bins)

    disparities = np.zeros(num_bins)
    for i in range(num_bins):
        cdf_values = np.clip(
            [v.cumcount[i] / len(data[A == k]) for k, v in cdfs.items()], 0, 1)
        disparities[i] = max(cdf_values) - min(cdf_values)

    return xs, cdfs, disparities
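A minimal usage sketch for compare_cdfs; the two-group synthetic data below is an illustrative assumption:

import numpy as np
from scipy.stats import cumfreq  # compare_cdfs uses the bare name cumfreq

rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(0, 1, 500), rng.normal(0.5, 1, 500)])
A = np.concatenate([np.zeros(500), np.ones(500)])
xs, cdfs, disparities = compare_cdfs(data, A)
print('max CDF disparity:', float(disparities.max()))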
Example 5
def visualize_cumulative_sum():
    ssandtss = [ss_mins,tss_mins,ss3end_mins]
    plt.rcParams["font.size"] = 16
    for index in range(3):
        plt.figure()
        dists=[]
        for dis in ssandtss[index]:
            if abs(int(dis)) > args.xlimit:
                dists.append(args.xlimit+2)
                continue
            dists.append(abs(int(dis)))
        cums = stats.cumfreq(dists,numbins=args.xlimit+2)
        plt.xlabel('distance [bp]')
        x = pd.Series(cums.cumcount)
        plt.xlim(0,args.xlimit)
        ax = sns.lineplot(data=x)
        if index==0:
            plt.ylabel('Number of splice sites')
            plt.savefig("ss_cumulative_plot.png", dpi=500, bbox_inches='tight')
            label="splice site"
        elif index==1:
            plt.ylabel('Number of TSSs')
            plt.savefig("tss_cumulative_plot.png", dpi=500,bbox_inches='tight')
        else:
            plt.ylabel('Number of 3-prime ends')
            plt.savefig("ss3end_cumulative_plot.png", dpi=500,bbox_inches='tight')
Example 6
def create_cdf(X):
	
	"""Create the cummulative density function of a continuous random
	variable, e.g. observed data.
	
	arguments

	X	-	Observed data from which a cdf should be constructed; please
			provide the data as a vector (Nx1 NumPy array or list).
	
	returns
	
	(bins, cdf)

	bins	-	All unique values in X, used as bins for the cdf.

	cdf	-	The cumulative density for each value in bins.
	"""
	
	# convert the data to a NumPy array.
	data = numpy.array(X)
	# the bins are all unique values in the data (sorted ascending)
	bins = numpy.unique(data)
	# calculate the cumulative frequency for each unique value in the data
	cumfreq, lowerlim, binsize, extra = stats.cumfreq(data, numbins=len(bins))
	# transform the cumulative frequencies to a cdf
	cdf = cumfreq / numpy.max(cumfreq)

	return bins, cdf
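A minimal usage sketch for create_cdf, using random data as a stand-in:

import numpy
from scipy import stats

x = numpy.random.randn(200)
bins, cdf = create_cdf(x)
print(len(bins), float(cdf[-1]))  # the cdf rises monotonically and ends at 1.0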
Example 7
def cumdist(vec, nbins=100):
    hist(vec, color='g', bins=nbins, density=True, align='mid')  # 'normed' was removed in newer Matplotlib
#    hist(vec, bins=nbins, normed=False, align='mid')
#    figure(2)    
    disc = cumfreq(vec, numbins=nbins)
    plot(disc[0]/len(vec)) 
    show()
Example 8
def plot_percent_percentile_plot(test_pred_do, test_y1):
    test_pred_do = np.swapaxes(test_pred_do, 0, 1)
    percentile = np.zeros((test_pred_do.shape[0], test_pred_do.shape[1]))
    z_score = np.zeros_like(percentile)
    for i in range(test_pred_do.shape[0]):
        for j in range(test_pred_do.shape[1]):
            if test_y1[i, j, 1] == 0:
                percentile[i, j] = np.nan
                z_score[i, j] = np.nan
                continue
            temp = np.append(test_pred_do[i, j, :], test_y1[i, j, 0])
            temp = np.sort(temp)
            ix = np.where(temp == test_y1[i, j, 0])
            percentile[i, j] = ix[0][0] / (len(temp) - 1) * 100
            z_score[i, j] = (test_y1[i, j, 0] - np.mean(
                test_pred_do[i, j, :])) / np.std(test_pred_do[i, j, :])

    mask = test_y1[:, :, 1].reshape((-1, ))
    ix = np.where(mask == 1)
    percentile = percentile.reshape((-1, ))[ix]
    #pyplot.figure();
    #pyplot.hist(percentile,bins=100);
    res = stats.cumfreq(percentile, numbins=100)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size,
                                     res.cumcount.size)
    pyplot.figure()
    pyplot.bar(x,
               res.cumcount / np.count_nonzero(mask) * 100,
               width=res.binsize)
    pyplot.plot(x, x, '-r', label='y=x')
    pyplot.show()
    return z_score
Example 9
def KS_principle(inData):
    '''Show the principle of the Kolmogorov-Smirnov test.'''

    # CDF of normally distributed data
    nd = stats.norm()
    nd_x = np.linspace(-4, 4, 101)
    nd_y = nd.cdf(nd_x)

    # Empirical CDF of the sample data, which ranges from approximately 0 to 10
    numPts = 50
    lowerLim = 0
    upperLim = 10
    ecdf_x = np.linspace(lowerLim, upperLim, numPts)
    ecdf_y = stats.cumfreq(inData, numPts, (lowerLim, upperLim))[0] / len(inData)

    # Add zero-point by hand
    ecdf_x = np.hstack((0., ecdf_x))
    ecdf_y = np.hstack((0., ecdf_y))

    # Plot the data
    sns.set_style('ticks')
    sns.set_context('poster')
    setFonts(36)

    plt.plot(nd_x, nd_y, 'k--')
    # plt.hold was removed from Matplotlib; successive plot calls overlay by default
    plt.plot(ecdf_x, ecdf_y, color='k')
    plt.xlabel('X')
    plt.ylabel('Cumulative Probability')

    # For the arrow, find the start
    ecdf_startIndex = np.min(np.where(ecdf_x >= 2))
    arrowStart = np.array([ecdf_x[ecdf_startIndex], ecdf_y[ecdf_startIndex]])

    nd_startIndex = np.min(np.where(nd_x >= 2))
    arrowEnd = np.array([nd_x[nd_startIndex], nd_y[nd_startIndex]])
    arrowDelta = arrowEnd - arrowStart

    plt.arrow(arrowStart[0],
              arrowStart[1],
              0,
              arrowDelta[1],
              width=0.05,
              length_includes_head=True,
              head_length=0.04,
              head_width=0.4,
              color='k')

    plt.arrow(arrowStart[0],
              arrowStart[1] + arrowDelta[1],
              0,
              -arrowDelta[1],
              width=0.05,
              length_includes_head=True,
              head_length=0.04,
              head_width=0.4,
              color='k')

    outFile = 'KS_Example.png'
    showData(outFile)
Example 10
def cdf_vals_from_data(data, numbins=None, maxbins=None):

    # make sure data is a numpy array
    data = numpy.array(data)
    
    # by default, use numbins equal to number of distinct values
    # TODO: shouldn't this be one per possible x val?
    if numbins is None:
        numbins = numpy.unique(data).size

    if maxbins is not None and numbins > maxbins:
        numbins = maxbins
    
    # bin the data and count fraction of points in each bin (for PDF)
    rel_bin_counts, min_bin_x, bin_size, _ =\
        stats.relfreq(data, numbins, (data.min(), data.max()))
    
    # bin the data and count each bin (cumulatively) (for CDF)
    cum_bin_counts, min_bin_x, bin_size, _ =\
        stats.cumfreq(data, numbins, (data.min(), data.max()))

    # normalize bin counts so rightmost count is 1
    cum_bin_counts /= cum_bin_counts.max()

    # make array of x-vals (lower end of each bin)
    x_vals = numpy.linspace(min_bin_x, min_bin_x+bin_size*numbins, numbins)

    # CDF always starts at y=0
    cum_bin_counts = numpy.insert(cum_bin_counts, 0, 0)  # y = 0
    cdf_x_vals = numpy.insert(x_vals, 0, x_vals[0])  # x = min x


    return cum_bin_counts, cdf_x_vals, rel_bin_counts, x_vals
Example 11
def cdfs(valueses, xlabel='value', labels=None, title='CDF', n_bins=500):
    """
    Plot one or more cumulative density functions
    :param valueses:
    :param xlabel:
    :param labels:
    :param title:
    :param n_bins:
    :return:
    """
    x_valueses = []
    y_valueses = []
    logger.debug("cdfs")
    for values in valueses:
        freq = cumfreq(values, n_bins)
        x_values = [
            freq.lowerlimit + x * freq.binsize for x in range(0, n_bins)
        ]
        y_values = freq.cumcount / len(values)
        logger.debug("binsize: %f" % freq.binsize)
        logger.debug("range: %f" % (freq.binsize * n_bins))
        logger.debug("y range: %f - %f" % (min(y_values), max(y_values)))
        x_valueses.append(x_values)
        y_valueses.append(y_values)

    return multiline(x_valueses,
                     y_valueses,
                     title=title,
                     xlabel=xlabel,
                     ylabel='density',
                     labels=labels)
Example 12
    def plotRECCurve(self, nbins=20, highlight_error=None, linestyle='-',
                     linewidth=1.0):
        """
        Plot a Regression Error Characteristic (REC) curve.

        The resulting REC curve shows the cumulative distribution of errors
        over the dataset, where the error is measured in distance of the mode
        of the mixture distribution from the target value in standard
        deviations.

        TODO: Use the true mode rather than the kernel with the largest mixing
        coefficient.
        """
        if self.y is None:
            self.update()
        alpha, sigma2, mu = mdn.getMixtureParams(self.y, self.module.M, self.module.c)
        #maxidxs = np.argmax(alpha, axis=1)
        maxidxs=self.getMaxKernel(alpha, sigma2)
        N=len(mu)
        mu = mu[np.arange(0,N), maxidxs]
        sigma2 = sigma2[np.arange(0,N), maxidxs]
        dist = np.sum(np.abs(mu-self.tgts), axis=1)
        dist /= np.sqrt(sigma2)
        h, _, _, _ = cumfreq(dist, nbins, [0, 10])
        h /= N
        plt.plot(np.linspace(0,10,nbins), h, linestyle, linewidth=linewidth)
        if highlight_error:
            plt.vlines(highlight_error, 0, 1, linestyles='-.')
        plt.xlabel(r'$\epsilon$ [n std deviations]')
        plt.ylabel('accuracy')
        return dist
Example 13
def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)
    
    # Set the fonts the way I like them
    sns.set_context('poster')
    sns.set_style('ticks')
    #mystyle.set()
    
    # Scatter plot
    scatter(arange(len(x)), x)
    xlim([0, len(x)])
    mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y', title='Scatter')
    
    # Histogram
    hist(x)
    mystyle.printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, default settings')
    
    hist(x,25)
    mystyle.printout('histogram.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, 25 bins')
    
    # Cumulative probability density
    numbins = 20
    plot(stats.cumfreq(x,numbins)[0])
    mystyle.printout('CumulativeFrequencyFunction.png', xlabel='Data Values', ylabel='Cumulative Frequency')
    
    # Boxplot
    # The box consists of the first, second (middle) and third quartile
    boxplot(x, sym='*')
    mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot')
    
    boxplot(x, sym='*', vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()
    
    # Errorbars
    x = arange(5)
    y = x**2
    errorBar = x/2
    errorbar(x,y, yerr=errorBar, fmt='o', capsize=5, capthick=3)
    xlim([-0.2, 4.2])
    ylim([-0.2, 19])
    mystyle.printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements', title='Errorbars')
    
    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    
    nd2 = stats.norm(loc = 3, scale = 1.5)
    data2 = nd2.rvs(size=(100))
    
    # Use pandas and the seaborn package for the violin plot
    df = pd.DataFrame({'Girls':data, 'Boys':data2})
    #sns.violinplot(df, color = ["#999999", "#DDDDDD"])
    sns.violinplot(df)
    
    mystyle.printout('violinplot.png')
Example 14
def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)

    # Set the fonts the way I like them
    sns.set_context('paper')
    sns.set_style('white')
    mystyle.set()

    # Scatter plot
    plot(x, '.')
    mystyle.printout('scatterPlot.png',
                     xlabel='x',
                     ylabel='y',
                     title='Scatter')

    # Histogram
    hist(x, color='#999999')
    mystyle.printout('histogram_plain.png',
                     xlabel='Data Values',
                     ylabel='Frequency',
                     title='Histogram, default settings')

    hist(x, 25, color='#999999')
    mystyle.printout('histogram.png',
                     xlabel='Data Values',
                     ylabel='Frequency',
                     title='Histogram, 25 bins')

    # Cumulative probability density
    numbins = 20
    plot(stats.cumfreq(x, numbins)[0])
    mystyle.printout('CumulativeFrequencyFunction.png',
                     xlabel='Data Values',
                     ylabel='Cumulative Frequency')

    # Boxplot
    # The box consists of the first, second (middle) and third quartile
    boxplot(x, sym='*')
    mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot')

    boxplot(x, sym='*', vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()

    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))

    nd2 = stats.norm(loc=3, scale=1.5)
    data2 = nd2.rvs(size=(100))

    # Use pandas and the seaborn package for the violin plot
    df = pd.DataFrame({'Girls': data, 'Boys': data2})
    sns.violinplot(df, color=["#999999", "#DDDDDD"])

    mystyle.printout('violinplot.png')
Example 15
def hist_eq(b):
    # Histogram-equalize an image band (NaN-aware) to uint8.
    bf = b.flatten()
    min_, max_ = nanmin(bf), nanmax(bf)
    cumfreqs, lowlim, binsize, extrapoints = cumfreq(bf, numbins=256, defaultreallimits=(min_, max_))
    # Normalized CDF scaled to 0..255 and cast to uint8, used as a lookup table
    cumfreqs = (255.99 * cumfreqs / cumfreqs[-1]).astype('u1')

    # Quantize the input to 0..255, then map each pixel through the CDF table
    result = (255.99 * (b - min_) / (max_ - min_)).clip(0, 255).astype('u1')
    return cumfreqs[result]
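A hedged usage sketch for hist_eq; the low-contrast random band and the imports for the bare names (nanmin, nanmax, cumfreq) are assumptions:

import numpy as np
from numpy import nanmin, nanmax
from scipy.stats import cumfreq

img = (np.random.rand(32, 32) ** 3) * 200.0  # low-contrast stand-in band
eq = hist_eq(img)
print(eq.dtype, int(eq.min()), int(eq.max()))  # uint8, spread toward 0..255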
Example 16
def __init__(self, data, numBins=None):
    if not numBins:
        numBins = int(len(data) / 5)
    res = stats.cumfreq(data, numbins=numBins)
    self.cdistr = res.cumcount / len(data)
    self.loLim = res.lowerlimit
    self.upLim = res.lowerlimit + res.binsize * res.cumcount.size
    self.binWidth = res.binsize
Example 17
def getPercentile(self):
    self.percentile16 = []
    self.percentile84 = []
    for i in range(0, 6):
        res = stats.cumfreq(self.fileData[i], numbins=400)
        self.percentile16.append(
            self.findPercentile(res, 0.16, len(self.fileData[i])))
        self.percentile84.append(
            self.findPercentile(res, 0.84, len(self.fileData[i])))
Example 18
def frequency(self, frequencyMap):
    arrayOfkeys = []
    weights = []
    for key in frequencyMap:
        arrayOfkeys.append(int(key))
        weights.append(frequencyMap[key])
    cumcount, lowerlimit, binsize, extrapoints = stats.cumfreq(
        arrayOfkeys, numbins=10, weights=weights)
    return cumcount
Example 19
def draw_cdf(e, cap, subplot):
    subplot.set_title(e['dist'].__name__ + ' capacity=' + str(cap))
    samples = sorted(e['dist'](cap))
    res = stats.cumfreq(samples, numbins=cap, defaultreallimits=e['section'])
    x = min(e['section']) + np.linspace(0, res.binsize * res.cumcount.size,
                                        res.cumcount.size)
    subplot.bar(x, res.cumcount / cap, width=res.binsize)
    subplot.set_ylim([0, 1.2])
    subplot.set_xlim([min(e['section']) - 1, max(e['section']) + 1])
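A minimal usage sketch for draw_cdf; the dictionary 'e' below (a sampler plus a plotting range) is an illustrative assumption about the caller's setup:

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

e = {'dist': np.random.standard_normal, 'section': (-4, 4)}
fig, ax = plt.subplots()
draw_cdf(e, cap=200, subplot=ax)
plt.show()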
Example 20
def main(filename):
    counts = get_data(filename)
    sorted_counts = sorted([v for v in counts.itervalues()])
    cumfreqs, lowlim, binsize, extrapoints = cumfreq(sorted_counts,
                                                     max(sorted_counts))
    norm_cumfreqs = cumfreqs / max(cumfreqs)
    plot.plot(norm_cumfreqs[:500], linewidth=1.5)
    plot.xlabel("mapped reads")
    plot.ylabel("splice junction")
    plot.show()
Example 21
def plot_cdf(hist_list, bins, norm_factor, min_spike_threshold,
             max_spike_threshold, plt_handle):

    res1 = stats.cumfreq(hist_list,
                         numbins=len(bins),
                         defaultreallimits=(min_spike_threshold,
                                            max_spike_threshold))
    total_count = res1.cumcount[-1]
    cum_count = total_count - res1.cumcount
    plt_handle.plot(bins, cum_count * norm_factor)
Example 22
def plotCDF(forecast, validation, title, xlims=[-1, 1]):
    vals, x1, x2, x3 = cumfreq(forecast, len(forecast))
    ax1 = plt.plot(np.linspace(np.min(forecast), np.max(forecast),
                               len(forecast)),
                   vals / len(forecast),
                   label='Simulation')
    vals, x1, x2, x3 = cumfreq(validation, len(validation))
    ax2 = plt.plot(np.linspace(np.min(validation), np.max(validation),
                               len(validation)),
                   vals / len(validation),
                   label='Reference')
    ax2 = plt.legend(prop={'size': 10})
    ax1 = plt.title(title)
    ax1 = plt.xlabel("Value")
    ax1 = plt.ylabel("ECDF")
    ax1 = plt.xlim(xlims[0], xlims[1])
    ax1 = plt.ylim(0, 1)
    pdf.savefig()
    plt.clf()
Example 23
    def plot(self, weights, row, col, shape, ix):
        full_lstm = np.zeros(shape)
        ix_lstm = np.zeros(shape)
        full_lstm[(row, col)] = weights
        ix_lstm[(row, col)] = 1

        plt.imshow(-ix_lstm, cmap=plt.get_cmap('binary'))
        plt.savefig('{}/{}.png'.format(self.Dir, ix))
        plt.clf()

        ng = shape[-1] // self.nh

        ix_lstm_p = np.reshape(ix_lstm, [-1, self.nh, ng])
        reduce_row = np.sum(ix_lstm_p, axis=(0, -1))
        from scipy import stats
        cumcount, lower, binsize, _ = stats.cumfreq(reduce_row, numbins=30)
        x = lower + np.linspace(0, binsize * cumcount.size, cumcount.size)
        plt.bar(x, cumcount / (len(reduce_row)), width=binsize)
        plt.xlim(0, 400)
        plt.xlabel('Parameters per Neuron')
        plt.ylabel('Cumulative %')

        plt.savefig('{}/cml{}.png'.format(self.Dir, ix))
        plt.clf()

        ng = shape[-1] // self.nh
        input_list = []
        rec_list = []
        for i in range(ng):
            input_list.append(
                np.sum(ix_lstm[:self.ni, i * self.nh:(i + 1) * self.nh]))
            rec_list.append(
                np.sum(ix_lstm[self.ni:, i * self.nh:(i + 1) * self.nh]))

        print("ratio:",
              np.sum(input_list) / np.sum(rec_list), "true_ratio:",
              shape[0] / self.nh - 1)

        inds = np.arange(ng)
        width = 0.35
        p1 = plt.bar(inds, input_list, width)
        p2 = plt.bar(inds, rec_list, width, bottom=input_list)

        if ng == 3:
            plt.xticks(inds, ('R-gate', 'Z-gate', 'O-gate'))

        elif ng == 4:
            plt.xticks(inds, ('I-gate', 'J-gate', 'F-gate', 'O-gate'))

        plt.ylabel('Number of Connections')
        plt.title("Remaining Weights by Gate and Type")
        plt.legend((p1[0], p2[0]),
                   ('Input Parameters', 'Recurrent Parameters'))
        plt.savefig('{}/bar{}.png'.format(self.Dir, ix))
Example 24
        def GetDistributions(sample1, sample2, nbins):

            # For consistency between CDFs
            lower_limit = min(min(sample1), min(sample2))
            upper_limit = max(max(sample1), max(sample2))

            # Create objects (H1, H2) that includes the cumulative frequency and surrounding information
            H1 = stats.cumfreq(sample1,
                               numbins=nbins,
                               defaultreallimits=(lower_limit, upper_limit))
            H2 = stats.cumfreq(sample2,
                               numbins=nbins,
                               defaultreallimits=(lower_limit, upper_limit))

            cumdist1 = H1.cumcount
            cumdist2 = H2.cumcount

            binsize = H1.binsize

            return lower_limit, upper_limit, H1, H2, cumdist1, cumdist2, binsize
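A hedged follow-up sketch: given the aligned cumulative counts returned above, a KS-style maximum ECDF gap falls out directly (this assumes the nested helper is reachable from the calling scope):

import numpy as np
from scipy import stats

s1 = np.random.normal(0.0, 1.0, 300)
s2 = np.random.normal(0.4, 1.0, 300)
lo, hi, H1, H2, c1, c2, bw = GetDistributions(s1, s2, nbins=50)
print('max ECDF gap:', float(np.max(np.abs(c1 / len(s1) - c2 / len(s2)))))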
Example 25
def cumulativePlot(samples, save_file=None):
    #fig = plt.figure(figsize=(8, 6))
    res = stats.cumfreq(samples, numbins=25)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size,
                                     res.cumcount.size)
    plt.bar(x, res.cumcount / res.cumcount[-1], width=res.binsize)
    plt.title('Cumulative histogram')
    plt.xlim([x.min(), x.max()])
    plt.xlabel("Wind Speed m/s")
    if save_file is not None:
        plt.savefig(save_file, dpi=300, pad_inches=0, bbox_inches='tight')
    plt.show()
Example 26
def plot_cumulative_frequency(vul_list):
    percent_fixing_commits = sorted([vul['num_fix_commits']/vul['num_release_commits']*100 for vul in vul_list])
    cumulative_frequency = stats.cumfreq(percent_fixing_commits, defaultreallimits = (-1, 101), numbins=len(percent_fixing_commits))

    trace = go.Scatter(
        name = 'Fixing release',
        x = [0] + percent_fixing_commits,
        y = [0] + list(map(lambda w: w/len(percent_fixing_commits) * 100, cumulative_frequency.cumcount))
    )

    layout = go.Layout(
        showlegend = False,
        yaxis = dict(
            title = 'Cumulative Frequency Distribution<br>(Fixing releases)',
            titlefont = dict(size=16),
            range = [0, 100],
            ticksuffix = '%'
        ),
        xaxis = dict(
            title = 'Fixing commits (%)',
            titlefont = dict(size=16),
            range = [0, 20],
            ticksuffix = '%'
        ),
        shapes = [
            dict(
                type = 'line',
                x0 = 14.28,
                x1 = 14.28,
                y0 = 0,
                y1 = 110,
                line = dict(
                    color = 'black',
                    dash = 'dash'
                )
            ),
            dict(
                type = 'line',
                x0 = 0,
                x1 = 110,
                y0 = 91.77,
                y1 = 91.77,
                line = dict(
                    color = 'black',
                    dash = 'dash'
                )
            )
        ]
    )

    fig = go.Figure(data=[trace], layout=layout)
    fig.write_html('cum_freq_dist.html')
    fig.write_image('cum_freq_dist.pdf', height=400, width=600)
Example 27
def plotCDF(forecast, validation, title, xlims=[-1, 1]):
    forecast[forecast < -1.01] = -1.01
    vals, x1, x2, x3 = cumfreq(forecast, len(forecast))
    ax1 = plt.plot(np.linspace(np.min(forecast), np.max(forecast),
                               len(forecast)),
                   vals / len(forecast),
                   label=str(config.get('Main options', 'RunName')))
    validation[validation < -1.01] = -1.01
    vals, x1, x2, x3 = cumfreq(validation, len(validation))
    ax2 = plt.plot(np.linspace(np.min(validation), np.max(validation),
                               len(validation)),
                   vals / len(validation),
                   label=str(config.get('Reference options', 'RunName')))
    ax2 = plt.legend(prop={'size': 10}, loc=2)
    ax1 = plt.title(title)
    ax1 = plt.xlabel("Value")
    ax1 = plt.ylabel("ECDF")
    ax1 = plt.xlim(xlims[0], xlims[1])
    ax1 = plt.ylim(0, 1)
    ax1 = plt.gcf().set_tight_layout(True)
    pdf.savefig()
    plt.clf()
Example 28
def iecdf(x, p, nbins=10):
    """f = iecdf(x, p, nbins=10) returns the reciprocal of the empirical cumulative distriution function at ordinate p
    """
    # if (p > 1 or p < 0):
    #     print "Error : Percentile p must be between 0 and 1."
    #     exit
    cum = stats.cumfreq(x, nbins)
    a = cum[0] / len(x)
    lowlim = cum[1]
    bsize = cum[2]
    uplim = lowlim + bsize * nbins
    bins = np.linspace(lowlim + bsize / 2, uplim - bsize / 2, nbins)
    freqs = interpolate.interp1d(a, bins)
    f = freqs(p)
    return f
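A minimal usage sketch for iecdf, reading off an approximate median (p=0.5) of normal data; the imports mirror the names the function body uses:

import numpy as np
from scipy import stats, interpolate

x = np.random.normal(10.0, 2.0, 1000)
print('approx median:', float(iecdf(x, 0.5, nbins=50)))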
Example 30
def computeCDF(data, precision=1000):
    from scipy.stats import cumfreq, scoreatpercentile
    freqs, _, _, _ = cumfreq(data, precision)

    # normalize by the total cumulative count (freqs[-1]), not max(data),
    # so the normalized CDF ends at 1.0
    freqsNormalized = [f / freqs[-1] for f in freqs]
    values = []

    step = 100. / precision

    scores = numpy.arange(0, 100 + step, step)
    for s in scores:
        values.append(scoreatpercentile(data, s))

    return values, freqs, freqsNormalized
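A minimal usage sketch for computeCDF; it assumes numpy is imported at module level, since the function body uses numpy.arange:

import numpy

data = numpy.random.rand(500).tolist()
values, freqs, freqsNormalized = computeCDF(data, precision=100)
print(values[0], values[-1], freqsNormalized[-1])  # normalized CDF ends at 1.0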
Example 31
def ecdfSyt(df,Group,Conc,threshold):
    tmp = df[(df['Conc'] == Conc) & (df['Group'] == Group)]
    tmp['time'] = np.round(tmp['time'], 1)
    tmp = tmp.sort_values('time')  # DataFrame.sort was removed; sort_values is the current API
    nbins = np.unique(tmp['time']).size
    tmp1 = cumfreq(tmp['time'].values, numbins=nbins)[0]
    time = np.unique(tmp['time'])
    tmparray = np.zeros([nbins, 4])
    DF = pd.DataFrame(tmparray, columns=["Group", "Conc", "time", "cumfreq"])
    DF['Group'] = [Group] * nbins
    DF['Conc'] = [Conc] * nbins
    DF['time'] = time
    DF['cumfreq'] = tmp1
    DF['cumfreq'] = DF['cumfreq'] / DF['cumfreq'].max()
    DF = DF[DF['cumfreq'] > threshold]
    # min-max rescale of the remaining cumulative frequencies to [0, 1]
    DF['cumfreq'] = (DF['cumfreq'] - DF['cumfreq'].min()) / (DF['cumfreq'].max() - DF['cumfreq'].min())

    DF['time'] = DF['time'] - DF['time'].min()
    return DF
Example 32
def cumulative_histogram(X, axis, bins=30):
    info = calculate_descriptive_stats(X)
    res = sps.cumfreq(X, numbins=bins)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size,
                                     res.cumcount.size)
    axis.bar(
        x,
        res.cumcount / np.max(res.cumcount),
        width=res.binsize,
        color='b',
        label=
        "Count: %d\nMin: %.4f\nMax: %.4f\nMean: %.4f\nStd: %.4f\nSkew: %.4f\nKurt: %.4f"
        % info)
    axis.set_title('Cumulative Histogram of Data')
    axis.set_xlim([x.min(), x.max()])
    axis.legend(loc='lower right')
Example 33
def make_cdf(img, n_bins=256):
    """
    Creates CDF of input image (map from [0,255] to [0,1])

    Inputs:
    - img: input image
    - n_bins: Number of bins used

    Output:
    - Dictionary containing the cumulative frequencies (CDF) of pixel values,
      contained in an array, and the number of items (pixels) used to compute the CDF
    """
    cdf = stats.cumfreq(img, n_bins, (0, 255))[0]
    cdf_ = {'cdf': np.array(cdf) / int(max(cdf)), 'n_items': int(max(cdf))}

    return cdf_
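A minimal usage sketch for make_cdf on a synthetic 8-bit image (the random image is an illustrative assumption):

import numpy as np
from scipy import stats

img = np.random.randint(0, 256, size=(64, 64))
cdf_ = make_cdf(img)
print(cdf_['n_items'], float(cdf_['cdf'][-1]))  # 4096 pixels; the CDF ends at 1.0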
Example 34
    def histogram(self,
                  data_dict,
                  file_name=False,
                  save=False,
                  resolution=None):

        if isinstance(data_dict, dict):
            data = []
            for i in data_dict:
                data.extend(data_dict[i])
        elif isinstance(data_dict, list):
            data = data_dict
        else:
            print("Input must be dictionary or list")
            return

        for j, mape in enumerate(data):
            if mape > 50:
                data[j] = 50

        res = stats.cumfreq(data, numbins=15, defaultreallimits=(0, 50))
        x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size,
                                         res.cumcount.size)
        cum_y = [
            i / (max(res.cumcount) - min(res.cumcount)) * 100
            for i in res.cumcount
        ]

        fig = plt.figure(figsize=(10, 4))
        ax1 = fig.add_subplot(1, 2, 1)
        ax2 = fig.add_subplot(1, 2, 2)
        ax1.hist(data, bins=15, histtype='bar', ec='black')
        ax1.set_title('Histogram')
        ax1.set_xlabel('MAPE(%)', fontsize=12)
        ax1.set_ylabel('Frequency (Days)', fontsize=12)
        # ax2.bar(x, res.cumcount, width=res.binsize)
        ax2.plot(x, cum_y, '-o')
        ax2.set_title('Cumulative Histogram')
        ax2.set_xlim([x.min(), x.max()])
        ax2.set_xlabel('MAPE(%)', fontsize=12)
        ax2.set_ylabel('Dataset Percentage (%)', fontsize=12)

        if save is True:
            plt.savefig(file_name + '.jpg',
                        format='jpg',
                        dpi=resolution,
                        bbox_inches='tight')
Example 36
def get_null_reference_cdf(
    lowerlimit: np.float32,
    upperlimit: np.float32,
    numbins: int = 1000,
) -> ModifiedECDF:
    """
    This function will return a CDF to be used as a null reference.

    :param lowerlimit: lower bound for the CDF
    :param upperlimit: upper bound for the CDF
    :param numbins: how many bins should be used for the reference

    :returns: ModifiedECDF of all zeros for the specified range
    """
    return ModifiedECDF(
        stats.cumfreq([],
                      numbins=numbins,
                      defaultreallimits=(lowerlimit, upperlimit)))
Example 37
def p(hinj=[],
      hrec=[],
      s=[],
      psrname='',
      detname='',
      style=sd.default_style,
      methods=[]):

    for method in methods:
        # First calculate the interquartile range
        # (http://comments.gmane.org/gmane.comp.python.scientific.user/19755)
        data = np.sort(hrec)
        # scoreatpercentile expects percentiles on a 0-100 scale
        upperQuartile = stats.scoreatpercentile(data, 75)
        lowerQuartile = stats.scoreatpercentile(data, 25)
        IQR = upperQuartile - lowerQuartile

        # Get ideal bin size
        #(http://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule)
        #         fdsize = 3.49*np.std(data)*len(data)**(-1./3.)
        fdsize = 2 * IQR * len(data)**(-1. / 3.)

        #Get number of bins
        #(http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30)
        num_bins = int((np.amax(data) - np.amin(data)) / fdsize)

        cumfreqs, lowlim, binsize, _ = stats.cumfreq(data, num_bins)
        pv = [1. - cdf / max(cumfreqs) for cdf in cumfreqs]
        bins = np.linspace(lowlim, lowlim + num_bins * binsize, num_bins)  # span the full binned range

        plt.plot(bins, pv, style, color=sd.sd.pltcolor[method], label=method)

        plt.yscale('log')

    plt.title(detname + ' PSR ' + psrname)

    plt.xlabel('$h_{rec}$')
    plt.ylabel('1 - CDF (log scale)')

    plt.legend(numpoints=1)
    plt.savefig('plots/p_' + detname + '_' + psrname, bbox_inches='tight')

    print 'Plotted and saved in: ',
    print 'plots/p_' + detname + '_' + psrname
    plt.close()
Example 38
def p_original(detector, psr, location='files/remote/source/'):
    d = pd.HDFStore(location + 'dataPitkin_' + detector + '.hdf5', 'r')
    a = d[psr].tolist()
    b = [abs(x) for x in a]

    # First calculate the interquartile range
    # (http://comments.gmane.org/gmane.comp.python.scientific.user/19755)
    data = np.sort(d[psr].tolist())
    # scoreatpercentile expects percentiles on a 0-100 scale
    upperQuartile = stats.scoreatpercentile(data, 75)
    lowerQuartile = stats.scoreatpercentile(data, 25)
    IQR = upperQuartile - lowerQuartile

    # Get ideal bin size
    # (http://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule)
    # fdsize = 3.49*np.std(data)*len(data)**(-1./3.)
    fdsize = 2 * IQR * len(data)**(-1. / 3.)

    # Get number of bins
    # (http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30)
    num_bins = int((np.amax(data) - np.amin(data)) / fdsize)

    cumfreqs, lowlim, binsize, _ = stats.cumfreq(data, num_bins)
    pv = [1. - cdf/max(cumfreqs) for cdf in cumfreqs]
    bins = np.linspace(lowlim, lowlim + num_bins * binsize, num_bins)  # span the full binned range

    # NOTE: 'style', 'method', 'detname' and 'psrname' must be supplied by the surrounding module.
    plt.plot(bins, pv, style, color=sd.pltcolor[method], label=method)
    
    plt.yscale('log')

    plt.title(detname + ' PSR ' + psrname)

    plt.xlabel('$h_{rec}$')
    plt.ylabel('1 - CDF (log scale)')

    plt.legend(numpoints=1)
    plt.savefig('plots/p_' + detname + '_' + psrname, bbox_inches='tight')
    
    print 'Plotted and saved in: ',
    print 'plots/p_' + detname + '_' + psrname
    plt.close()
Example 40
def plot_cdfs(benchmark, benchmark_experiments, os_type, results_dir, output_dir, output_extension):
  print "Parsing and plotting runtime results for %d %s experiments...\n" % (len(benchmark_experiments), benchmark)

  runtime_results = parse_runtime_results(benchmark, benchmark_experiments, os_type, aggregate=False)

  if len(runtime_results) == 0:
    print "Not enough results found for %s. Skipping..." % benchmark
    return

  keyed_by_mem_size = defaultdict(list)
  for jvm_count, memsize_to_results in sorted(runtime_results.iteritems(), key=lambda t: t[0]):
    for memsize, runtimes in memsize_to_results.iteritems():
      keyed_by_mem_size[memsize].append((jvm_count, runtimes))

  for mem_size, jvm_to_runtimes in sorted(keyed_by_mem_size.iteritems(), key=lambda t: t[0]):
    plt.clf()
    ax = plt.subplot(111)
    longest_time = max(reduce(lambda x,y: x + y, [t[1] for t in jvm_to_runtimes]))
    shortest_time = min(reduce(lambda x,y: x + y, [t[1] for t in jvm_to_runtimes]))
    for jvm_count, runtime_list in jvm_to_runtimes:
      cum_freqs, ll, binsize, xp = cumfreq(runtime_list, numbins=len(runtime_list))
      normed_cum_freqs = map(lambda x: x/max(cum_freqs), cum_freqs)
      padded_x = [shortest_time*0.8, min(runtime_list)] + sorted(runtime_list) + [longest_time*1.1]
      padded_y = [0, 0] + normed_cum_freqs + [1]
      ax.plot(padded_x, padded_y, label="%d JVMs" % jvm_count)

    # Apply labels and bounds
    plt.title("%s Mean Iteration Runtime CDF (%d MB Heap)" % (benchmark, mem_size))
    plt.ylabel("Fraction of Jobs Completed")
    plt.xlabel("Time (ms)")
    plt.xlim(shortest_time*0.8, longest_time*1.1)
    plt.ylim(-0.025, 1.025)
    # Move legend to the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.85, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    save_or_show_current(output_dir, 'cdfs', benchmark, output_extension, suffix='%03dMB' % mem_size)
Example 41
def gmm_test(X,k0,k1,nboot):
    nsample = X.shape[0]

    gmm0 = mixture.GMM(n_components=k0, covariance_type='full')
    gmm0.fit(X)
    L0 = sum(gmm0.score(X))
    gmm1 = mixture.GMM(n_components=k1, covariance_type='full')
    gmm1.fit(X)
    L1 = sum(gmm1.score(X))
    LRstat = -2*(L1 - L0)
    
    LRstat0 = []

    for i in range(nboot):
        Xboot = gmm0.sample(n_samples=nsample)
        gmm0_boot = mixture.GMM(n_components=k0, covariance_type = 'full')
        gmm0_boot.fit(Xboot)
        L0_boot = sum(gmm0_boot.score(Xboot))
        gmm1_boot = mixture.GMM(n_components=k1, covariance_type = 'full')
        gmm1_boot.fit(Xboot)
        L1_boot = sum(gmm1_boot.score(Xboot))
        LRstat0.append(-2*(L1_boot - L0_boot))

    ecdf, lowlim, binsize, extrapoints = cumfreq(LRstat0)
    ecdf = ecdf/len(LRstat0)

    # bin centers for the bootstrap ECDF; use an array so that the
    # element-wise comparison below works (a plain list would not)
    bins = lowlim + binsize * (np.arange(len(ecdf)) + 0.5)

    p = max(ecdf[bins <= LRstat])

    return p
Example 42
File: los.py Project: mac389/McAfee
#-------------------------------------------------------------------------------------------------	
##################################################################################################

#load data
data = array([4.92, 6.52,7.33, 5.75])



#bootstrapping, assumes that the data here completely describe (are completely representative thereof) the underlying distribution.
sample_count = data.shape[0]
variable_count = 1
jitter_count = 1000

replicates = tile(data,(jitter_count,1))
replicates += (random(replicates.shape)*(max(data)-min(data))+min(data))
map(shuffle,replicates)
distribution = ravel(diff(replicates,axis=1))
cdf = cumfreq(distribution)

overview = plt.figure(figsize =(8.27,11.69)) #Thus instructeth PDM
ax = overview.add_subplot(111)
xvals = linspace(cdf[1],cdf[1]+10*cdf[2],num=10) #By default cumfreq divides into 10 bins
n,bins,patches=ax.hist(distribution, normed=True)
plt.setp(patches, 'facecolor', 'k', 'alpha', 0.75)
tech.adjust_spines(ax,['left','bottom'])
overview.text(0.5, 0.08, r'Weekly Change in Length of Stay', ha='center', va='top', fontsize=30, weight='bold') #xlabel
overview.text(0.02875, 0.5, r'Chance of Occurrence', ha='center', va='center', rotation='vertical', fontsize=30, weight='bold')
ax.annotate(r'April 21, 2010',xy=(1.1,.18), xytext=(1.2,.2),arrowprops=dict(facecolor='black', shrink=0.05), fontsize=20)
plt.subplots_adjust(top=0.95, bottom =0.18, left=0.15)
plt.savefig('cdf_LOS.jpg',dpi=600)
Example 43
def simplePlots():
    '''Demonstrate the generation of different statistical standard plots'''
    
    # Univariate data -------------------------
    
    # Make sure that always the same random numbers are generated
    np.random.seed(1234)
    
    # Generate data that are normally distributed
    x = np.random.randn(500)
    
    # Other graphics settings
    sns.set(context='poster', style='ticks', palette=sns.color_palette('muted'))
    
    # Set the fonts the way I like them
    setFonts(32)
    
    # Scatter plot
    plt.scatter(np.arange(len(x)), x)
    plt.xlim([0, len(x)])
    
    # Save and show the data, in a systematic format
    printout('scatterPlot.png', xlabel='Datapoints', ylabel='Values', title='Scatter')
    
    # Histogram
    plt.hist(x)
    printout('histogram_plain.png', xlabel='Data Values',
             ylabel='Frequency', title='Histogram, default settings')
    
    plt.hist(x,25)
    printout('histogram.png', xlabel='Data Values', ylabel='Frequency',
             title='Histogram, 25 bins')
    
    # Cumulative probability density
    numbins = 20
    plt.plot(stats.cumfreq(x,numbins)[0])
    printout('CumulativeFrequencyFunction.png', xlabel='Data Values',
             ylabel='CumFreq', title='Cumulative Frequency')

    # KDE-plot
    sns.kdeplot(x)
    printout('kde.png', xlabel='Data Values', ylabel='Density',
            title='KDE_plot')
    
    # Boxplot
    # The box consists of the first, second (middle) and third quartile
    plt.boxplot(x, sym='*')
    printout('boxplot.png', xlabel='Values', title='Boxplot')
    
    plt.boxplot(x, sym='*', vert=False)
    plt.title('Boxplot, horizontal')
    plt.xlabel('Values')
    plt.show()
    
    # Errorbars
    x = np.arange(5)
    y = x**2
    errorBar = x/2
    plt.errorbar(x,y, yerr=errorBar, fmt='o', capsize=5, capthick=3)
    plt.xlim([-0.2, 4.2])
    plt.ylim([-0.2, 19])
    printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements', title='Errorbars')
    
    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    
    nd2 = stats.norm(loc = 3, scale = 1.5)
    data2 = nd2.rvs(size=(100))
    
    # Use pandas and the seaborn package for the violin plot
    df = pd.DataFrame({'Girls':data, 'Boys':data2})
    sns.violinplot(df)
    
    printout('violinplot.png', title='Violinplot')
    
    # Barplot
    # The font-size is set such that the legend does not overlap with the data
    np.random.seed(1234)
    setFonts(20)
    
    df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
    df.plot(kind='bar', grid=False, color=sns.color_palette('muted'))
    
    showData('barplot.png')
    setFonts(28)

    # Bivariate Plots
    df2 = pd.DataFrame(np.random.rand(50, 3), columns=['a', 'b', 'c'])
    df2.plot(kind='scatter', x='a', y='b', s=df2['c']*500)
    plt.axhline(0, ls='--', color='#999999')
    plt.axvline(0, ls='--', color='#999999')
    printout('bivariate.png')
    
    # Grouped Boxplot
    sns.set_style('whitegrid')
    sns.boxplot(df)
    setFonts(28)
    printout('groupedBoxplot.png', title='sns.boxplot')

    sns.set_style('ticks')

    # Pieplot
    txtLabels = 'Cats', 'Dogs', 'Frogs', 'Others'
    fractions = [45, 30, 15, 10]
    offsets =(0, 0.05, 0, 0)
    
    plt.pie(fractions, explode=offsets, labels=txtLabels,
            autopct='%1.1f%%', shadow=True, startangle=90,
            colors=sns.color_palette('muted') )
    plt.axis('equal')
    printout('piePlot.png', title=' ')
Example 44
density = gaussian_kde(list_merged_by_ball_id)
xs = numpy.linspace(0,8,200)
density.covariance_factor = lambda : .25
density._compute_covariance()
plt.plot(xs,density(xs))
plt.xlabel('KDE,number of appear time by blue ball number')
plt.ylabel('KDE,counter of appear time by blue ball number')
plt.show()
## CDF (The Cumulative Distribution Function)
from scipy.stats import cumfreq
idx_max = max(dfs_blue_balls_count_values)
hi = idx_max
a = numpy.arange(hi) ** 2
#    for nbins in ( 2, 20, 100 ):
for nbins in dfs_blue_balls_count_values:    
    cf = cumfreq(a, nbins)  # bin values, lowerlimit, binsize, extrapoints
    w = hi / nbins
    x = numpy.linspace( w/2, hi - w/2, nbins )  # care
    # print x, cf
    plt.plot( x, cf[0], label=str(nbins) )

plt.legend()
plt.xlabel('CDF,number of appear time by blue ball number')
plt.ylabel('CDF,counter of appear time by blue ball number')
plt.show()

###Optional: Comparing Distributions with Probability Plots and QQ Plots
###Quantile plot of the server data. A quantile plot is a graph of the CDF with the x and y axes interchanged.
###Probability plot for the data set shown,a standard normal distribution:
###@see: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.probplot.html
import scipy.stats as stats
Example 45
# Histogram
hist(x)
xlabel('Data Values')
ylabel('Frequency')
title('Histogram, default settings')
show()

hist(x,25)
xlabel('Data Values')
ylabel('Frequency')
title('Histogram, 25 bins')
show()

# Cumulative probability density
numbins = 20
cdf = stats.cumfreq(x,numbins)
plot(cdf[0])
xlabel('Data Values')
ylabel('Cumulative Frequency')
title('Cumulative probability density function')
show()

# Boxplot
# The error bars indicate the range, and the box consists of the
# first, second (middle) and third quartile
boxplot(x)
title('Boxplot')
ylabel('Values')
show()

boxplot(x, vert=False)
Example 46
    def execute(self,
                sqr_nodes,
                connectivity,
                randomize_boot,
                sec_before_inject,
                sec_after_inject,
                inject_node,
                k,
                distance,
                filenamebase):

        print "="*40
        print "Executing HistGraph:"
        print "filenamebase\t\t", filenamebase
        print "="*40

        node_re = 'DEBUG \((\d+)\):'
        node_re_c = re.compile(node_re)
        time_re = '(\d+):(\d+):(\d+.\d+)'
        time_re_c = re.compile(time_re)

        consist = np.zeros((sqr_nodes, sqr_nodes))

        f = open(filenamebase+".log", "r")
        for line in f:
            #print line,
            if line.find("inconsistent") >= 0:
                #print line,

                node_obj = node_re_c.search(line)
                node = int(node_obj.group(1))

                time_obj = time_re_c.search(line)
                #print "\t", time_obj.group(0),
                t = Time(time_obj.group(1),
                         time_obj.group(2),
                         time_obj.group(3))
                #print t.in_second()

                #print "id", node,
                (x, y) = id2xy(node, sqr_nodes)
                #print "->", x, y

                consist[x][y] = t.in_second() - sec_before_inject

        f.close()

        LOW_TIME = 0
        HIGH_TIME = 50
        BINS = 100

        #print consist.flatten()
        cdf = stats.cumfreq(consist.flatten(), BINS, (LOW_TIME, HIGH_TIME))
        #print cdf #, max(cdf[0]), cdf[0]/max(cdf[0])
        #print floatRange(LOW_TIME, HIGH_TIME, cdf[2])

        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111)

        plt.plot(floatRange(LOW_TIME, HIGH_TIME, cdf[2]),
                 cdf[0]/max(cdf[0]),
                 ls='steps')
        # plt.hist(consist.flatten(),
        #          bins = 100,
        #          cumulative=True,
        #          normed=True,
        #          histtype='step')
        plt.grid()
        plt.title('Model Time to Consistency (cdf)')
        text = str(sqr_nodes) + "x" + str(sqr_nodes) + "\n" + \
            "Distance: " + str(distance) + "\n" + \
            "K: " + str(k)
#            "Connectivity: " + str(connectivity) + "\n" + \
        plt.text(.5, .1, text,
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform = ax.transAxes,
                 bbox=dict(facecolor='red', alpha=0.2))

        plt.ylim(0,
                 1)
        #plt.xlim(0,
        #         50)

        plt.xlabel("Model Time [s]")
        plt.ylabel("Nodes consistent [%]")

        plt.savefig(filenamebase+"_hist.png")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
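    # Expected column formats (1-based indices; a minimal, assumed example):
    #   --sample_one_cols 1,2,3        columns of the first sample
    #   --sample_cols "1,2;3,4"        ";"-separated groups for multi-sample tests
    # Note: this script relies on Python 2 semantics, where map() returns a
    # list; under Python 3 the map() results would need wrapping in list().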
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for row in freq:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for value in critical:
                cols.append(value)
            cols.append(",")
            for value in sig:
                cols.append(value)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min_val = stats.tmin(map(float, sample_one))
            else:
                min_val = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min_val)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max_val = stats.tmax(map(float, sample_one))
            else:
                max_val = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max_val)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for value in s:
                cols.append(value)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for value in rel:
                cols.append(value)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for value in t1:
                cols.append(value)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for value in h2:
                cols.append(value)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for value in t:
                cols.append(value)
            for value in prob:
                cols.append(value)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for value in a:
                cols.append(value)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ",".join(map(str, row))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
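
# A hypothetical invocation of the script above (the file name is assumed),
# selecting one test per run and reading/writing tab-separated files:
#
#   python stats_tool.py --infile data.tsv --outfile out.tsv \
#       --test_id ttest_ind --equal_var \
#       --sample_one_cols 1,2,3 --sample_two_cols 4,5,6
#
# Each input line is echoed with the test's outputs (e.g. statistic and
# p-value) appended as extra tab-separated columns.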
Esempio n. 48
0
from pylab import *  # pylab-style namespace (assumed): randn, plot, hist, linalg, ...
import matplotlib.pyplot as plt
from scipy import stats


def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)
    
    # Scatter plot
    plot(x,'.')
    title('Scatter Plot')
    xlabel('X')
    ylabel('Y')
    draw()
    show()
    
    # Histogram
    hist(x)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, default settings')
    show()
    
    hist(x,25)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, 25 bins')
    show()
    
    # Cumulative probability density
    numbins = 20
    cdf = stats.cumfreq(x,numbins)
    plot(cdf[0])
    xlabel('Data Values')
    ylabel('Cumulative Frequency')
    title('Cumulative probability density function')
    show()
    
    # Boxplot
    # The error bars indicate the range, and the box consists of the
    # first, second (middle) and third quartiles
    boxplot(x)
    title('Boxplot')
    ylabel('Values')
    show()
    
    boxplot(x, vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()
    
    # Check for normality
    _ = stats.probplot(x, plot=plt)
    title('Probplot - check for normality')
    show()
    
    # Bivariate data -------------------------
    
    # Generate data
    x = randn(200)
    y = 10+0.5*x+randn(len(x))
    
    # Scatter plot
    scatter(x,y)
    # This one is quite similar to "plot(x,y,'.')"
    title('Scatter plot of data')
    xlabel('X')
    ylabel('Y')
    show()
    
    # LineFit
    M = vstack((ones(len(x)), x)).T
    pars = linalg.lstsq(M,y)[0]
    intercept = pars[0]
    slope = pars[1]
    scatter(x,y)
    # (hold() is deprecated; matplotlib keeps existing artists by default)
    plot(x, intercept + slope*x, 'r')
    show()
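
# The same line fit can be obtained more directly with scipy.stats.linregress,
# which also reports the correlation and a p-value; a minimal sketch reusing
# the x and y generated above:
#
#   slope, intercept, r_value, p_value, stderr = stats.linregress(x, y)
#   plot(x, intercept + slope*x, 'r')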
Esempio n. 49
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import mystyle  # local plotting helper (assumed) providing printout()


def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = np.random.randn(500)
    
    # Set the fonts the way I like them
    sns.set_context('poster')
    sns.set_style('ticks')
    #mystyle.set()
    
    # Scatter plot
    plt.scatter(np.arange(len(x)), x)
    plt.xlim([0, len(x)])
    mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y', title='Scatter')
    
    # Histogram
    plt.hist(x)
    mystyle.printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, default settings')
    
    plt.hist(x,25)
    mystyle.printout('histogram.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, 25 bins')
    
    # Cumulative probability density
    numbins = 20
    plt.plot(stats.cumfreq(x,numbins)[0])
    mystyle.printout('CumulativeFrequencyFunction.png', xlabel='Data Values', ylabel='CumFreq', title='Cumulative Frequency')

    # KDE-plot
    sns.kdeplot(x)
    mystyle.printout('kde.png', xlabel='Data Values', ylabel='Density',
            title='KDE_plot')
    
    # Boxplot
    # The box consists of the first, second (middle) and third quartiles
    plt.boxplot(x, sym='*')
    mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot')
    
    plt.boxplot(x, sym='*', vert=False)
    plt.title('Boxplot, horizontal')
    plt.xlabel('Values')
    plt.show()
    
    # Errorbars
    x = np.arange(5)
    y = x**2
    errorBar = x/2
    plt.errorbar(x,y, yerr=errorBar, fmt='o', capsize=5, capthick=3)
    plt.xlim([-0.2, 4.2])
    plt.ylim([-0.2, 19])
    mystyle.printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements', title='Errorbars')
    
    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    
    nd2 = stats.norm(loc = 3, scale = 1.5)
    data2 = nd2.rvs(size=(100))
    
    # Use pandas and the seaborn package for the violin plot
    df = pd.DataFrame({'Girls':data, 'Boys':data2})
    #sns.violinplot(df, color = ["#999999", "#DDDDDD"])
    sns.violinplot(df)
    
    mystyle.printout('violinplot.png', title='Violinplot')
    
    # Barplot
    df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
    df.plot(kind='bar', grid=False)
    mystyle.printout('barplot.png', title='Barplot')

    # Grouped Boxplot
    sns.set_style('whitegrid')
    sns.boxplot(df)
    mystyle.printout('groupedBoxplot.png', title='sns.boxplot')

    # Bivariate Plots
    df2 = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd'])
    df2.plot(kind='scatter', x='a', y='b', s=df2['c']*300)
    mystyle.printout('bivariate.png')

    # Pieplot
    series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series')
    sns.set_palette("husl")
    series.plot(kind='pie', figsize=(6, 6))
    mystyle.printout('piePlot.png', title='pie-plot')
import os

import numpy
import matplotlib.pyplot as plt
import west.boundary  # west: local whitespace-analysis package (assumed importable)
import west.data_management
import west.data_map
import west.population


def plot_acceleration_or_instantaneous_curves(number_of_repacks):
    def check_for_zeros(latitude, longitude, latitude_index, longitude_index, current_value):
        if current_value == 0:
            return 1
        return 0

    def or_function(this_value, other_value):
        return numpy.logical_or(this_value, other_value)

    def calculate_population_of_zerows(datamap, populationmap):
        population = 0
        for i in range(400):
            for j in range(600):
                if datamap.get_value_by_index(i, j) == 1:
                    population += populationmap.get_value_by_index(i, j)
        return population


    colors = {7: 'b', 14: 'r', 22: 'g', 25: 'm'}


    for num_channels_removed in [25]:
        zerows_map = west.data_map.DataMap2DContinentalUnitedStates.create(400, 600)
        zerows_map.reset_all_values(0)

        datamap_spec = west.data_management.SpecificationDataMap(west.data_map.DataMap2DContinentalUnitedStates, 400, 600)
        is_in_region_map_spec = west.data_management.SpecificationRegionMap(west.boundary.BoundaryContinentalUnitedStates, datamap_spec)
        is_in_region_map = is_in_region_map_spec.fetch_data()
        population_map_spec = west.data_management.SpecificationPopulationMap(is_in_region_map_spec, west.population.PopulationData)
        population_map = population_map_spec.fetch_data()

        instantaneous_values = numpy.zeros(number_of_repacks)
        acceleration_values = numpy.zeros(number_of_repacks)
        num_repacks_index = numpy.arange(number_of_repacks)

        if num_channels_removed == 25:
            repack_file_list = os.listdir(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsRemoved"%num_channels_removed, "Only UHF"))
            repack_file_list = repack_file_list[1:]

        else:
            repack_file_list = []
            for i in range(number_of_repacks):
                repack_file_list.append("%dUHFnewUSMinimumStationstoRemove_OnlyUHF_PLMRS_FCCcontours%d.pcl"%(num_channels_removed, i))
        for i in range(number_of_repacks):
            print(i)
            print(repack_file_list[i])
            if num_channels_removed == 25:
                wsmap = west.data_map.DataMap2DContinentalUnitedStates.from_pickle(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsRemoved"%num_channels_removed, "Only UHF", repack_file_list[i]))
            else:
                wsmap = west.data_map.DataMap2DContinentalUnitedStates.from_pickle(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsRemoved"%num_channels_removed, repack_file_list[i]))
            wsmap.update_all_values_via_function(check_for_zeros)
            zerows_map = zerows_map.combine_datamaps_with_function(wsmap, or_function)

            instantaneous_values[i] = calculate_population_of_zerows(wsmap, population_map)
            acceleration_values[i] = calculate_population_of_zerows(zerows_map, population_map)

            print(instantaneous_values[i], acceleration_values[i])


        from scipy.stats import cumfreq
        num_bins = 100
        inst_values_cdf = cumfreq(instantaneous_values, num_bins)
        xaxis = numpy.linspace(0, max(instantaneous_values), num_bins)
        plt.plot(xaxis, inst_values_cdf[0]/number_of_repacks)

    plt.xlabel("Population that sees zero whitespace after repack")
    plt.ylabel("CDF")
    plt.show()
Esempio n. 51
0
# plot the histogram
plt.clf()
plt.bar(bins[:-1], n, width=0.4, color='red')
plt.xlabel('X', fontsize=20)
plt.ylabel('number of data points in the bin', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/hist.png')

# compute and plot the relfreq
relfreqs, lowlim, binsize, extrapoints = st.relfreq(x)
plt.clf()
plt.bar(bins[:-1], relfreqs, width=0.4, color='magenta')
plt.xlabel('X', fontsize=20)
plt.ylabel('Relative frequencies', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/relfreq.png')

# compute and plot pdf
plt.clf()
n, bins, patches = plt.hist(x, 10, density=True, facecolor='yellow', alpha=0.5)  # density= replaces the deprecated normed=
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/pdf.png')

# compute and plot cdf
cumfreqs, lowlim, binsize, extrapoints = st.cumfreq(x)
plt.clf()
plt.bar(bins[:-1], cumfreqs/cumfreqs[-1], width=0.4, color='black', alpha=0.45)
plt.xlabel('X', fontsize=15)
plt.ylabel('CDF', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/cdf.png')
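
# For an exact empirical CDF, free of cumfreq's binning, sorting the sample
# directly is a common alternative; a minimal sketch (assumes numpy as np):
#
#   x_sorted = np.sort(x)
#   ecdf = np.arange(1, len(x_sorted) + 1) / float(len(x_sorted))
#   plt.step(x_sorted, ecdf)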

label_files = [open(filename).readlines() for filename in os.listdir(".") if filename.endswith(label_extension)]


# Bootstrapping: assumes the data completely describe (i.e., are fully representative of) the underlying distribution.
sample_count = data_nodates.shape[0]
variable_count = data_nodates.shape[1]
jitter_count = 1000


distributions = reshape(
    array([diff(shuffler(data_nodates), axis=0) for jitter in range(jitter_count)]),
    ((sample_count - 1) * jitter_count, variable_count))

#generate cdfs


cdfs = array([cumfreq(distribution)[0:3] for distribution in distributions.transpose()])
# (sample_count - 1 above because the differences of n samples have n - 1 members)
print(cdfs[1])
overview = plt.figure()
ax = overview.add_subplot(111)
for cdf in cdfs:
	xvals = array([ cdf[1] + i*cdf[2] for i in range(10)])
	h, = ax.plot(xvals,cdf[0]/max(cdf[0]),'--.',markersize=30)
	h.set_clip_on(False)
#plt.legend(('CXR','ABD CT','ABD + Chest CT'),'lower right', numpoints=1, fancybox=True, frameon=False, bbox_to_anchor=(1.1,0.2))
tech.adjust_spines(ax,['left','bottom'])
overview.text(0.5, 0.08, r'Weekly Change in Cases', ha='center', va='top', fontsize=30, weight='bold') #xlabel
overview.text(0.02875, 0.5, r'Frequency of Occurrence', ha='center', va='center', rotation='vertical', fontsize=30, weight='bold')
plt.subplots_adjust(top=0.95, bottom =0.18, left=0.15)
plt.savefig('cdf_CXRT_s.png',dpi=300)
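
# shuffler() is defined elsewhere; a minimal stand-in consistent with its use
# above (independently permuting the rows of each column) might look like:
#
#   import numpy as np
#   def shuffler(data):
#       # permute each column independently to break temporal ordering
#       return np.column_stack([np.random.permutation(col) for col in data.T])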
Esempio n. 53
0
from pylab import *  # pylab-style namespace (assumed): randn, plot, hist, linalg, ...
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats


def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)
    
    # Scatter plot
    plot(x,'.')
    title('Scatter Plot')
    xlabel('X')
    ylabel('Y')
    draw()
    show()
    
    # Histogram
    hist(x)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, default settings')
    show()
    
    hist(x,25)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, 25 bins')
    show()
    
    # Cumulative probability density
    numbins = 20
    cdf = stats.cumfreq(x,numbins)
    plot(cdf[0])
    xlabel('Data Values')
    ylabel('Cumulative Frequency')
    title('Cumulative probability density function')
    show()
    
    # Boxplot
    # The error bars indicate the range, and the box consists of the
    # first, second (middle) and third quartiles
    boxplot(x)
    title('Boxplot')
    ylabel('Values')
    show()
    
    boxplot(x, vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()
    
    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    nd2 = stats.norm(loc = 0.5, scale = 1.2)
    data2 = nd2.rvs(size=(100))
    
    # Use the seaborn package for the violin plot, and set the context for "poster"
    sns.set(context='poster')
    df = pd.DataFrame({'Girls':data, 'Boys':data2})
    sns.violinplot(df)
    show()
    
    # Check for normality
    _ = stats.probplot(x, plot=plt)
    title('Probplot - check for normality')
    show()
    
    # Bivariate data -------------------------
    
    # Generate data
    x = randn(200)
    y = 10+0.5*x+randn(len(x))
    
    # Scatter plot
    scatter(x,y)
    # This one is quite similar to "plot(x,y,'.')"
    title('Scatter plot of data')
    xlabel('X')
    ylabel('Y')
    show()
    
    # LineFit
    M = vstack((ones(len(x)), x)).T
    pars = linalg.lstsq(M,y)[0]
    intercept = pars[0]
    slope = pars[1]
    scatter(x,y)
    # (hold() is deprecated; matplotlib keeps existing artists by default)
    plot(x, intercept + slope*x, 'r')
    show()
Esempio n. 54
0
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import cumfreq

P = 0.4
Q = 1 - P 

a = np.random.geometric(P, size=1000)
b = np.random.geometric(P, size=1000)
c = np.random.geometric(P, size=1000)

common_params = dict(bins=list(range(1, 14)), density=True, range=(0, 15))  # density= replaces the deprecated normed=

plt.title('Comparative histogram of simulations vs. theoretical probability')
plt.ylabel('Relative frequency')
plt.xlabel('Values')
plt.hist([a, b, c], **common_params)

gteo = [P * pow(Q, i-1) for i in range(1,14)]
g = cumfreq(gteo, 15)



plt.show()
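# Note: cumfreq(gteo, 15) bins the pmf values themselves rather than
# accumulating the probabilities, so the theoretical geometric CDF is more
# naturally a running sum; a minimal sketch:
#
#   cdf_teo = np.cumsum(gteo)              # P(X <= k) for k = 1..13
#   plt.step(range(1, 14), cdf_teo, where='post')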

Esempio n. 55
0
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import cumfreq


def histplot1D(datain, **kwargs):
    
    datatype=kwargs.get('datatype','df')
    
    if datatype == 'df':
        histvals=datain.values
        binrange=kwargs.get('binrange',[datain.min(),datain.max()])
    elif datatype == 'histdict':
        counts=datain['counts']
        edges=datain['edges']
        centers=datain['centers']
        step=np.diff(edges)
        
    numbins=kwargs.get('numbins',100)  
    missinglowval=kwargs.get('missinglowval',-99999)
    missinghighval=kwargs.get('missinghighval',99999)
    normalize=kwargs.get('normalize',True)
    cumulative=kwargs.get('cumulative',False)
    
    doplot=kwargs.get('doplot',True)  
    showplot=kwargs.get('showplot',False)
    saveplot=kwargs.get('saveplot',False)
    plotfilename=kwargs.get('plotfilename','1Dhist_test.png')
    fsize=kwargs.get('fsize',32) #baseline font size
    ar=kwargs.get('ar',1.0)  #aspect ratio
    figheight=kwargs.get('figheight',12) #inches 
    dpi=kwargs.get('dpi',100)      
    fignum=kwargs.get('fignum',0)
    xlog=kwargs.get('xlog',False)
    ylog=kwargs.get('ylog',False)
    xlimits=kwargs.get('xlimits',None)
    ylimits=kwargs.get('ylimits',None)
    xlabel=kwargs.get('xlabel',None)
    if ylog:
        ylabel=kwargs.get('ylabel','Log (Counts)')
    else:
        ylabel=kwargs.get('ylabel','Counts')
    
    if datatype == 'histdict':
        dictout=datain
    else:
        if not cumulative:
            counts,edges=np.histogram(datain.values,numbins,range=binrange,normed=normalize) 
            step=np.diff(edges)
            centers=edges[:-1]+step*0.5            
            dictout={'counts':counts,'centers':centers,'edges':edges}
        else:            
            counts,lowlim,barwidths,extrapoints=cumfreq(datain.values,numbins=numbins,defaultreallimits=binrange)
            if normalize:
                totcounts=((datain.values>missinglowval)&(datain.values<missinghighval)).sum()
                counts=counts/totcounts
            step=(binrange[1]-binrange[0])/numbins
            binvals=np.arange(binrange[0],binrange[1],step)
            centers=[v+step*0.5 for v in binvals]
            edges=np.hstack((binvals,binvals[-1]+step))
            dictout={'counts':counts,'centers':centers,'edges':edges}
    if doplot:
        plt.rc('font', family='serif', size=fsize)
        fig1=plt.figure(fignum)
        if ar:
            fig1.set_size_inches(figheight*ar,figheight)
        ax1=fig1.add_subplot(111)
        barwidths=step
        if xlog:
            logplot=True
            plt.xscale('log')
        else:
            logplot=False
        
        if ylog:
            logplot=True
            plt.yscale('log')
        else:
            logplot=False
            ax1.set_aspect(ar)
            
        ax1.bar(centers,counts,width=barwidths,align='center',log=logplot)  
        

        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if ylimits:
            plt.ylim(ylimits)
        if xlimits:
            plt.xlim(xlimits)
        if saveplot:
            fig1.canvas.print_figure(plotfilename,dpi = dpi, edgecolor = 'b', bbox_inches = 'tight') 
        if showplot:
            fig1.canvas.draw()
    return dictout
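
# A hypothetical call, assuming a pandas Series of measurements:
#
#   import pandas as pd
#   data = pd.Series(np.random.randn(1000))
#   hist = histplot1D(data, numbins=50, cumulative=True, normalize=True,
#                     xlabel='Value', saveplot=True, plotfilename='cdf1D.png')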
Esempio n. 56
0
    def _calcCMC(self, size):
        cumfreqs = (cumfreq(self.matching_order, numbins=size)[0] / size) * 100.
        self.CMC = cumfreqs.astype(np.float32)
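
# A standalone sketch of the same cumulative match characteristic (CMC)
# computation; matching_order is assumed to hold each probe's match rank and
# size the number of ranks (and probes), mirroring the method above:
#
#   import numpy as np
#   from scipy.stats import cumfreq
#   def calc_cmc(matching_order, size):
#       return ((cumfreq(matching_order, numbins=size)[0] / size) * 100.).astype(np.float32)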
## plot
# countMax = max(overlapSer)
# bins = np.arange(countMax+1)
# plt.hist(overlapSer,bins)
# plt.ylabel('freq',fontweight='bold')
# plt.xlabel('number of cell lines',fontweight='bold')
# plt.title('summlySpace DOS compounds (' + str(overlapCount) + ') - cell lines is_gold')
# plt.xticks(bins)
# outF = os.path.join(wkdir, 'DOS_summly_cell_line_distribution.png')
# plt.savefig(outF, bbox_inches='tight',dpi=200)
# plt.close()


from scipy.stats import cumfreq
num_bins = 20
b, lowlim, binsize, extrapoints = cumfreq(passSer, num_bins)
outF = os.path.join(wkdir, 'cdf_test.png')
plt.plot(b)
plt.savefig(outF, bbox_inches='tight',dpi=200)
plt.close()

#cumsum
dx = .01
X = np.arange(-2, 2, dx)
Y = pylab.exp(-X**2)
# Normalize the data to a proper PDF
Y /= (dx*Y).sum()

# Compute the CDF
CY = np.cumsum(Y*dx)
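
# Sanity check for the numerically integrated CDF above: since Y was
# normalized so that (dx*Y).sum() == 1, the CDF should end at 1.
#
#   assert abs(CY[-1] - 1.0) < 1e-9
#   pylab.plot(X, CY)
#   pylab.show()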
Esempio n. 58
0
    build_count_graph([(name, [s.rtt / 1000 for s in data], color) for name, data, color in datasets], counts_of='ms')
    build_legend()
    save_graph("rtt")

"""
This is a CDF of the bandwidth, which is very useful for comparing the overall response of multiple versions/setups
"""

if "cdf" in args.graphs:
    build_graph("Cumulative Distribution of Throughput")

    # Set up the x axis for 1 Gbps operation: 1000 points at 1 Mbps spacing.
    hist_xpoints = range(0, 1000000000, 1000000)
    hist_xticks = range(0, 1000000000, 100000000)
    hist_xlabels = ["{:4.0f}".format(t / 1e6) for t in hist_xticks]

    for name, data, color in datasets:
        # cumfreq expects an integer bin count; len(hist_xpoints) - 1 bins
        # line up with the x points plotted below
        plt.plot(hist_xpoints[:-1],
                 stats.cumfreq([s.bw for s in data], len(hist_xpoints) - 1, (0, 1e9))[0] / len(data),
                 label=name,
                 color=color)

    plt.xticks(hist_xticks, hist_xlabels)
    hist_yticks = np.arange(0, 101, 10) / 100.0
    plt.yticks(hist_yticks, ["{:2.0%}".format(float(t)) for t in hist_yticks])
    plt.xlabel("Mbps")
    plt.ylabel("percentile")

    build_legend(loc=4)
    save_graph("cdf")
Esempio n. 59
0
        loop_time = time.time() - start_time       
        
        if sdOutput == "NA":
            print "randomSDFun while loop did not run in time"            
        else:
            sd_list.append(sdOutput)
                   
        
    print("--- %s seconds ---" % (time.time() - start_time))
    
    if len(sd_list) == numBoots:
                
        minList = min(sd_list)
        maxList = max(sd_list)

        cumFreq = cumfreq(sd_list, numBins, defaultreallimits=(minList, maxList))
        lowerLimit = cumFreq[1]
        countValues = cumFreq[0]
        freq_interval = cumFreq[2]
        upperLimit = lowerLimit + (freq_interval * numBins)
        xaxis = np.arange(lowerLimit, upperLimit, freq_interval)
        
        if len(xaxis) > numBins:
            print "xaxis too long, length:", len(xaxis)
            del_index = numBins
            xaxis = np.delete(xaxis, del_index)
        
        

        result = (sd_data - minList) / freq_interval
        myIndex = int(round(result)) - 1