Example #1
def conditional_mutual_information2(data_x, data_y, data_z, bw_method=None):
    """
    Conditional mutual information estimator.
    :param data_x: first distribution data
    :param data_y: conditioned variables data
    :param data_z: second distribution data
    :param bw_method: parameter of gaussian_kde of scipy
    :return: estimated conditional mutual information
    """
    n_x, n_y, n_z = data_x.shape[1], data_y.shape[1], data_z.shape[1]
    assert n_x == 1

    if data_y.shape[1] == 0:  # no conditioning variables (cf. the len(y) == 0 check in Example #5)
        return mutual_information(data_x, data_z)
    y_distr = gaussian_kde(np.transpose(data_y), bw_method=bw_method)
    xy_distr = gaussian_kde(np.transpose(np.hstack([data_x, data_y])), bw_method=bw_method)
    xyz_distr = gaussian_kde(np.transpose(np.hstack([data_x, data_y, data_z])), bw_method=bw_method)
    z_distr = gaussian_kde(np.transpose(data_z), bw_method=bw_method)
    yz_distr = gaussian_kde(np.transpose(np.hstack([data_y, data_z])), bw_method=bw_method)

    def f(s):
        x_part = s[:n_x]
        y_part = s[n_x:n_y+n_x]
        xy_part = s[:n_x + n_y]
        yz_part = s[n_x:n_x+n_y+n_z]
        z_part = s[n_x+n_y:n_x+n_y+n_z]
        return log(xyz_distr(s)) + log(y_distr(y_part)) - log(xy_distr(xy_part)) - log(yz_distr(yz_part))

    return monte_carlo_integration(lambda size=1: np.transpose(xyz_distr.resample(size=size)), f)
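A minimal usage sketch for the estimator above, assuming the module's mutual_information and monte_carlo_integration helpers are in scope; inputs are (n_samples, n_variables) arrays with a single x column:

import numpy as np

rng = np.random.default_rng(0)
n = 2000
data_z = rng.normal(size=(n, 1))
data_x = data_z + 0.1 * rng.normal(size=(n, 1))  # X tracks Z closely
data_y = rng.normal(size=(n, 1))                 # Y is independent noise

# I(X; Z | Y) should come out clearly positive for this construction.
print(conditional_mutual_information2(data_x, data_y, data_z))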
def hl_distances_from_set(A_list, B, points=65, margin_factor=0.25, bw=None):
    """
    Calculates Hellinger distances of A_list sets from the set B using
    continuous formula.
    """
    if bw is None:
        bw = B.shape[0]**(-1.0/5)*0.5
    yBs = []
    xs = []
    for j in range(B.shape[1]):
        minx, maxx = B[:, j].min(), B[:, j].max()
        margin = (maxx-minx)*margin_factor
        minx -= margin
        maxx += margin
        xs.append(np.linspace(minx, maxx, points))
        try:
            yBs.append(gaussian_kde(B[:, j], bw_method=bw)(xs[-1]))
        except (np.linalg.LinAlgError, ValueError):
            print("Singular matrix -- unable to perform gaussian KDE.")
            yBs.append(np.zeros(xs[-1].shape))

    for A in A_list:
        if A.shape[0] < 2:
            yield 1.0
        else:
            integral = 1
            for j, yB, x in zip(range(len(yBs)), yBs, xs):
                try:
                    y = (np.sqrt(gaussian_kde(A[:, j], bw_method=bw)(x)) -
                         np.sqrt(yB))**2
                    integral *= (1-0.5*simps(y, dx=(x[1]-x[0])))
                    del x, yB
                except np.linalg.LinAlgError:
                    integral = 0.0
            yield 1-integral
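A consumption sketch for the generator above, assuming gaussian_kde (scipy.stats) and simps (scipy.integrate) are imported at module level as the function requires:

import numpy as np

rng = np.random.default_rng(1)
B = rng.normal(0.0, 1.0, size=(500, 2))
A_same = rng.normal(0.0, 1.0, size=(500, 2))  # matching distribution: distance near 0
A_far = rng.normal(4.0, 1.0, size=(500, 2))   # shifted distribution: distance near 1

for d in hl_distances_from_set([A_same, A_far], B):
    print(d)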
Example #3
def distFunc(ys,xs):
    '''
    Calculate the distance between two empirical distributions.
    ys: data generated by the model; xs: observed data.
    Returns the KL-divergence between the KDE-smoothed distributions.
    '''
    if (np.sum(ys)==0):
        return np.inf
    else:
        if xs.ndim == 1:
            kernely = stats.gaussian_kde(ys)
            kernelx = stats.gaussian_kde(xs)
            xx = np.linspace(np.min(xs),np.max(xs)) #range over data.
            return stats.entropy(kernelx(xx),qk=kernely(xx)) #KL-divergence.
        else:
            #dimensions are (npoints,nparams) to keep consistent with sci-kit
            #learn
            kernely = stats.gaussian_kde(ys.T)
            kernelx = stats.gaussian_kde(xs.T)
            #range over n-dimensional data (npoints,nparams)
            mesh = [np.linspace(np.min(xs[:,i]),np.max(xs[:,i]))
                    for i in range(xs.shape[1])]
            xx = np.meshgrid(*mesh)
            xx = np.array([x.ravel() for x in xx]).T
            return stats.entropy(kernelx(xx.T),qk=kernely(xx.T)) #KL-divergence.
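A usage sketch, assuming the module-level numpy/scipy imports the function relies on; a simulation matching the data should score a much smaller divergence than a mismatched one:

import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
xs = rng.normal(0.0, 1.0, 500)       # observed data
ys_good = rng.normal(0.0, 1.0, 500)  # well-matched simulation: small divergence
ys_bad = rng.normal(5.0, 1.0, 500)   # mismatched simulation: large divergence
print(distFunc(ys_good, xs), distFunc(ys_bad, xs))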
Example #4
def save_report(
    report_path, prefix, decoys, targets, top_decoys, top_targets, cutoffs, svalues, qvalues, pvalues, lambda_
):

    if plt is None:
        raise ImportError("you need matplotlib package to create a report")

    plt.figure(figsize=(10, 20))
    plt.subplots_adjust(hspace=0.5)

    plt.subplot(511)
    plt.title(prefix + "\n\nROC")
    plt.xlabel("False Positive Rate (qvalue)")
    plt.ylabel("True Positive Rate (svalue)")

    plt.scatter(qvalues, svalues, s=3)
    plt.plot(qvalues, svalues)

    plt.subplot(512)
    plt.title("d_score Performance")
    plt.xlabel("dscore cutoff")
    plt.ylabel("rates")

    plt.scatter(cutoffs, svalues, color="g", s=3)
    plt.plot(cutoffs, svalues, color="g", label="TPR (svalue)")
    plt.scatter(cutoffs, qvalues, color="r", s=3)
    plt.plot(cutoffs, qvalues, color="r", label="FPR (qvalue)")

    plt.subplot(513)
    plt.title("Top Peak Groups' d_score Distributions")
    plt.xlabel("d_score")
    plt.ylabel("# of groups")
    plt.hist([top_targets, top_decoys], 20, color=["w", "r"], label=["target", "decoy"], histtype="bar")
    plt.legend(loc=2)

    plt.subplot(514)
    tdensity = gaussian_kde(top_targets)
    tdensity.covariance_factor = lambda: 0.25
    tdensity._compute_covariance()
    ddensity = gaussian_kde(top_decoys)
    ddensity.covariance_factor = lambda: 0.25
    ddensity._compute_covariance()
    xs = linspace(min(concatenate((top_targets, top_decoys))), max(concatenate((top_targets, top_decoys))), 200)
    plt.title("Top Peak Groups' d_score Density")
    plt.xlabel("d_score")
    plt.ylabel("density")
    plt.plot(xs, tdensity(xs), color="g", label="target")
    plt.plot(xs, ddensity(xs), color="r", label="decoy")
    plt.legend(loc=2)

    plt.subplot(515)
    if pvalues is not None:
        counts, __, __ = plt.hist(pvalues, bins=40)
        y_max = max(counts)
        plt.plot([lambda_, lambda_], [0, y_max], "r")
        plt.title("histogram pvalues")

    plt.savefig(report_path)

    return cutoffs, svalues, qvalues, top_targets, top_decoys
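The covariance_factor/_compute_covariance pattern in subplot 514 predates scipy's public bandwidth API; since scipy 0.11 the same fixed factor can be set without touching private attributes. A sketch, with samples standing in for top_targets or top_decoys:

density = gaussian_kde(samples)
density.set_bandwidth(bw_method=0.25)  # replaces the lambda patch and _compute_covariance() call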
Example #5
def conditional_mutual_information(data, x, y, z, bw_method=None):
    """
    Conditional mutual information estimator.
    :param x: variables of the first distribution (list)
    :param y: conditioned variables (list)
    :param z: variables of the second distribution (list)
    :param bw_method: parameter of gaussian_kde of scipy
    :return: estimated conditional mutual information
    """
    n_x, n_y, n_z = len(x), len(y), len(z)
    assert n_x == 1

    if len(y) == 0:
        data_x = data[:, x]
        data_z = data[:, z]
        return mutual_information(data_x, data_z)

    y_distr = gaussian_kde(np.transpose(data[:, y]), bw_method=bw_method)
    xy_distr = gaussian_kde(np.transpose(data[:, x + y]), bw_method=bw_method)
    xyz_distr = gaussian_kde(np.transpose(data[:, x + y + z]), bw_method=bw_method)
    z_distr = gaussian_kde(np.transpose(data[:, z]), bw_method=bw_method)
    yz_distr = gaussian_kde(np.transpose(data[:, y + z]), bw_method=bw_method)

    def f(s):
        x_part = s[:n_x]
        y_part = s[n_x:n_y+n_x]
        xy_part = s[:n_x + n_y]
        yz_part = s[n_x:n_x+n_y+n_z]
        z_part = s[n_x+n_y:n_x+n_y+n_z]
        return log(xyz_distr(s)) + log(y_distr(y_part)) - log(xy_distr(xy_part)) - log(yz_distr(yz_part))

    return monte_carlo_integration(lambda size=1: np.transpose(xyz_distr.resample(size=size)), f)
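Unlike the array-based variant in Example #1, this version takes column indices into a single (n_samples, n_variables) array; a minimal call sketch, again assuming the mutual_information and monte_carlo_integration helpers are in scope:

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(2000, 3))
data[:, 2] += data[:, 0]  # make column 2 (z) depend on column 0 (x)
print(conditional_mutual_information(data, x=[0], y=[1], z=[2]))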
def test_plot():
    import math
    from numpy.random import normal
    from scipy import stats
    global data

    def f(x):
        return 2*x + 1

    mean = 2
    var = 3
    std = math.sqrt(var)

    data = normal(loc=2, scale=std, size=50000)

    d2 = f(data)
    n = stats.norm(mean, std)

    kde1 = stats.gaussian_kde(data,  bw_method='silverman')
    kde2 = stats.gaussian_kde(d2,  bw_method='silverman')
    xs = np.linspace(-10, 10, num=200)

    #plt.plot(data)
    plt.plot(xs, kde1(xs))
    plt.plot(xs, kde2(xs))
    plt.plot(xs, n.pdf(xs), color='k')

    num_bins=100
    h = np.histogram(data, num_bins, density=True)
    plt.plot(h[1][1:], h[0], lw=4)

    h = np.histogram(d2, num_bins, density=True)
    plt.plot(h[1][1:], h[0], lw=4)
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new
    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan', bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
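Stripped to its core, kde_opt4 scores each class with a naive-Bayes style product of one-dimensional KDE likelihoods. A self-contained sketch of that idea (illustrative names; the kdeBGK10 bandwidth fallback specific to this code base is omitted):

import numpy as np
from scipy.stats import gaussian_kde

def kde_naive_bayes_scores(X_train, y_train, X_test):
    """Score each test row by a per-class product of 1-D KDE likelihoods."""
    classes = np.unique(y_train)
    scores = np.ones((X_test.shape[0], classes.size))
    for ci, c in enumerate(classes):
        Xc = X_train[y_train == c]  # assumes each class has enough distinct samples for a KDE
        for j in range(X_train.shape[1]):
            kde = gaussian_kde(Xc[:, j])
            scores[:, ci] *= kde.evaluate(X_test[:, j])
    return scores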
Example #8
def generate_animation(train_filename, test_filename, encrypted_key, sequence):
    pairs = [chr1 + chr2 for chr1 in ALPHABET for chr2 in ALPHABET]
    freq1 = count_bigram_frequency(read_and_simply_text(train_filename))
    freq2 = count_bigram_frequency(encrypt_by_key_substitution(read_and_simply_text(test_filename), encrypted_key))
    # desired key
    dct = dict(zip(pairs, [chr1 + chr2 for chr1 in encrypted_key for chr2 in encrypted_key]))
    data = [math.log(freq1[pair]*freq2[dct[pair]]) for pair in pairs]
    density = gaussian_kde(data)
    xs = np.linspace(0,8,200)
    density.covariance_factor = lambda : .25
    density._compute_covariance()
    plt.ion()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    final = density(xs)
    line1, = ax.plot(xs, final, 'b')
    line2, = ax.plot(xs, final, 'r')

    for key in sequence:
        # inverting the key
        dct = dict(zip(key, ALPHABET))
        key = [dct[chr] for chr in ALPHABET]
        # finding similarity between key
        dct = dict(zip(pairs, [chr1 + chr2 for chr1 in key for chr2 in key]))
        data1 = [math.log(freq1[pair]*freq2[dct[pair]]) for pair in pairs]
        density = gaussian_kde(data1)
        xs = np.linspace(0,8,200)
        density.covariance_factor = lambda : .25
        density._compute_covariance()
        line1.set_ydata(density(xs))
        fig.canvas.draw()
        time.sleep(0.5)
Example #9
    def plot_data_comb_2D(self, results_path, file_n, data, fit, timepoints):

        pp = PdfPages(results_path+'/'+file_n)
        cc = 0
        for tp in timepoints:
            xmin, xmax = -3, 3
            ymin, ymax = -3, 3

            xx, yy = mgrid[xmin:xmax:100j, ymin:ymax:100j]
            positions = vstack([xx.ravel(), yy.ravel()])
            values = vstack([ log10(1+data[tp][:, 0]), log10(1+data[tp][:, 1])])
            kernel = st.gaussian_kde(values)
            f = reshape(kernel(positions).T, xx.shape)

            xxf, yyf = mgrid[xmin:xmax:100j, ymin:ymax:100j]
            positions_f = vstack([xxf.ravel(), yyf.ravel()])
            values_f = vstack([log10(1+fit[tp][:, 0]), log10(1+fit[tp][:, 1])])
            kernel_f = st.gaussian_kde(values_f)
            ff = reshape(kernel_f(positions_f).T, xxf.shape)

            ax = plt.subplot(4, 5, cc+1)
            ax.contourf(xx, yy, f, cmap='Blues')
            ax.contourf(xxf, yyf, ff, cmap='Reds')

            ax.set_xlim([-1, 3])
            ax.set_ylim([-1, 3])
            cc += 1
        pp.savefig()
        plt.close()
        pp.close()
Example #10
def figures():
    data = np.genfromtxt('examscores.csv')

    sigma2 = 25.
    mu_0 = 80.
    sigma2_0 = 16.
    alpha = 3.
    beta = 50.
    mu_samples, sigma2_samples = gibbs(data, sigma2, mu_0, sigma2_0, alpha, beta)
    
    mukernel = gaussian_kde(mu_samples)

    x_min = min(mu_samples) - 1.
    x_max = max(mu_samples) + 1.
    x = np.arange(x_min, x_max, step=.1)
    plt.plot(x, mukernel(x))
    plt.savefig("mu_posterior.pdf")
    plt.clf()

    sigma2kernel = gaussian_kde(sigma2_samples)

    x_min = min(sigma2_samples) - 1.
    x_max = max(sigma2_samples) + 1.
    x = np.arange(x_min, x_max, step=1.)
    plt.plot(x, sigma2kernel(x))
    plt.savefig("sigma2_posterior.pdf")
    plt.clf()
   
    score_samples = np.array([norm.rvs(mu_sample, np.sqrt(sigma2_sample))
                            for mu_sample, sigma2_sample in zip(mu_samples, sigma2_samples)])
    score_kernel = gaussian_kde(score_samples)
    x_min = min(score_samples) - 1.
    x_max = max(score_samples) + 1.
    x = np.arange(x_min, x_max, step=1.)
    plt.plot(x, score_kernel(x))
    plt.savefig("predictiveposterior.pdf")
Example #11
def add_kernel_density_estimate(data, graph=None, filename=None):
    data = np.array(data)
    fig,axis_eruptions,axis_waiting = None,None,None
    if graph is None:
        fig, (axis_eruptions,axis_waiting) = pyplot.subplots(1,2,sharex=False,sharey=False)
        _set_axis_properties(axis_eruptions,axis_waiting)
    else:
        fig,axis_eruptions,axis_waiting = graph
        axis_eruptions = axis_eruptions.twinx()
        axis_waiting = axis_waiting.twinx()
        
    fig.subplots_adjust(wspace=0.5)
    axis_eruptions.set_ylabel('Density')
    axis_waiting.set_ylabel('Density')
    
    density_eruptions = stats.gaussian_kde(data[:,0])
    density_waiting = stats.gaussian_kde(data[:,1])
    
    x_eruptions = np.arange(0.,6,0.05)
    axis_eruptions.plot(x_eruptions, density_eruptions(x_eruptions),'k-')
    
    x_waiting = np.arange(40.,100.,0.5)
    axis_waiting.plot(x_waiting, density_waiting(x_waiting), 'k-')
    
    if filename is not None:
        fig.savefig(filename)
Example #12
    def _make_kde(self, conf=0.95):

        self.durkde = gaussian_kde(self.durs)
        self.depthkde = gaussian_kde(self.deps)
        self.slopekde = gaussian_kde(self.slopes)
        self.logdepthkde = gaussian_kde(self.logdeps)

        if self.fit_converged:
            try:
                durconf = kdeconf(self.durkde, conf)
                depconf = kdeconf(self.depthkde, conf)
                logdepconf = kdeconf(self.logdepthkde, conf)
                slopeconf = kdeconf(self.slopekde, conf)
            except Exception as e:
                raise MCMCError("Error generating confidence intervals...fit must not have worked.") from e

            durmed = np.median(self.durs)
            depmed = np.median(self.deps)
            logdepmed = np.median(self.logdeps)
            slopemed = np.median(self.slopes)

            self.durfit = (durmed, np.array([durmed - durconf[0], durconf[1] - durmed]))
            self.depthfit = (depmed, np.array([depmed - depconf[0], depconf[1] - depmed]))
            self.logdepthfit = (logdepmed, np.array([logdepmed - logdepconf[0], logdepconf[1] - logdepmed]))
            self.slopefit = (slopemed, np.array([slopemed - slopeconf[0], slopeconf[1] - slopemed]))

        else:
            self.durfit = (np.nan, (np.nan, np.nan))
            self.depthfit = (np.nan, (np.nan, np.nan))
            self.logdepthfit = (np.nan, (np.nan, np.nan))
            self.slopefit = (np.nan, (np.nan, np.nan))

        points = np.array([self.durs, self.logdeps, self.slopes])
        self.kde = gaussian_kde(points)
Example #13
    def summary_stats(self, data):
        """Returns tuple containing summary statistics named in summary_stat_names
        """
        if data is None:
            return [np.nan]*len(self.summary_stat_names)

        N = len(data)
        min_logP, max_logP = np.log(self.min_period), np.log(self.max_period)
        logP_grid = np.linspace(min_logP, max_logP, 1000)
        if N > 1:
            k = gaussian_kde(np.log(data.period.values))
            logP_pdf = k(logP_grid)
        else:
            logP_pdf = np.ones(len(logP_grid))*1./(max_logP - min_logP)

        logd_grid = np.linspace(-4, 0, 1000)
        if N > 1:
            k = gaussian_kde(data.logd_pri)
            logd_pdf = k(logd_grid)
        else:
            logd_pdf = np.ones(len(logd_grid))*1./(4)

        phase_sec = data.phase_sec.dropna().values
            
        return logP_pdf, N, phase_sec, logd_pdf
    def update(self, mu, weight):
        assert 0 <= weight <= 1

        if weight == 1:
            self.definite_points.append(mu)
        else:
            self.possible_points.append((mu, weight))

        # just keep the PRIOR distribution
        if self.definite_points == []:
            return

        if self.possible_points != []:
            # turn into an array for numpy's purposes
            a = np.array(self.possible_points)

            # want to keep if weight is greater than random number
            mask = a[:, 1] > np.random.rand(len(self.possible_points))
            sampled = a[:, 0][mask]

            if sampled.size > 0:
                points = np.concatenate((np.array(self.definite_points), np.array(sampled)))
            else:
                points = np.array(self.definite_points)
        else:
            points = np.array(self.definite_points)

        if points.size > 1:
            #print "points: " + str(points)
            self.distribution = gaussian_kde(points)
        else:
            # UGLY HACK: gaussian_kde needs at least 2 points, so pad around the single point
            points = np.array([points[0] - OFFSET, points[0] + OFFSET])
            self.distribution = gaussian_kde(points)
Example #15
def lookatresults(data, modes, theta=None, vert=False, labels=None):


    P = data[-1][0]
    n = P.shape[0]

    if labels is None:
        labels = [""] * n

    if vert == True:
        subplots = range(n*100+11,n*100+n+11,1)
        figsize = (6, 3*n)
    elif vert == 'four':
        subplots = [221, 222, 223, 224]
        figsize = (10, 10)
    else:
        subplots = range(100+n*10+1,100+n*10+1+n,1)
        figsize = (5*n, 3)

    f = stats.gaussian_kde(data[-1][0])
    int_guess = np.mean(data[-1][0], axis=1)
    modes = minimize(neg, int_guess, args=(f,)).x

    thetas = []
    P = data[-1][0]
    labelpad = 20

    for i in range(n):
        x = P[i]
        t = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i]-stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84)-modes[i], i+1)

        thetas.append(t)

    if P.shape[1] > 10:
        bins = int(np.sqrt(P.shape[1]))
    else:
        bins = 10
    fig = plt.figure(figsize=figsize)
    
    for i in range(n):
        print(subplots[i])
        plt.subplot(int(subplots[i]))
        #plt.title(thetas[0])
        ker = stats.gaussian_kde(P[i])
        h = plt.hist(P[i], bins=bins, density=True, alpha=1)
        x = np.linspace(h[1][0],h[1][-1],1000)
        plt.plot(x,ker(x))
        plt.xlabel(labels[i], labelpad=labelpad, fontsize=24)
        if theta is not None:
            plt.axvline(theta[0])

    for t in thetas:
        print(t)

    return fig
Example #16
def test(ens, x_train, y_train, train_spread, x_test, y_test, test_spread, moneyline):
    # find training error
    plot = False
    # Currently moneyline is set to false.  Not sure how to set to true
    ens.predict(x_train, train=True)
    a = ens.blend()
    ens.validate(y_train, train_spread, False, moneyline)

    ens.predict(x_test, train=False)
    b = ens.blend()
    c = ens.validate(y_test, test_spread, True, moneyline)

    if plot:
        density = gaussian_kde(a)
        xs = np.linspace(-20, 20, 200)
        density.covariance_factor = lambda: 0.1
        density._compute_covariance()
        plt.plot(xs, density(xs))

        density = gaussian_kde(b)
        xs = np.linspace(-20, 20, 200)
        density.covariance_factor = lambda: 0.1
        density._compute_covariance()
        plt.plot(xs, density(xs))
    return c
def kde_minmode(data,x,max_num_mode,min_mode_pdf):
    kde=gaussian_kde(data)
    f=kde.factor
    f_list=np.linspace(f,(data.max()-data.min()),100)
    s=UnivariateSpline(x,kde(x),s=0)
    s1=UnivariateSpline(x,s(x,1),s=0)
    s2=UnivariateSpline(x,s1(x,1),s=0)
    extrema=s1.roots()
    
    maxima=extrema[np.where((s2(extrema)<0)*(s(extrema)>=min_mode_pdf))]
    
    if len(maxima)>max_num_mode:
        for q in range(1,len(f_list)):
            f=f_list[q]
            kde2=gaussian_kde(data,bw_method=f)
            s=UnivariateSpline(x,kde2(x),s=0)
            s1=UnivariateSpline(x,s(x,1),s=0)
            s2=UnivariateSpline(x,s1(x,1),s=0)
            extrema=s1.roots()
            maxima=extrema[np.where((s2(extrema)<0)*(s(extrema)>=min_mode_pdf))]
            if len(maxima)<=max_num_mode:
##                print 'modes: ',maxima
                break
        kde=gaussian_kde(data,bw_method=f)
##    else:
##        print maxima

    return kde,maxima
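A usage sketch, assuming gaussian_kde (scipy.stats) and UnivariateSpline (scipy.interpolate) are imported at module level as the function requires:

import numpy as np

rng = np.random.default_rng(3)
data = np.concatenate([rng.normal(-2, 0.3, 300), rng.normal(2, 0.3, 300)])
x = np.linspace(data.min(), data.max(), 512)
kde, maxima = kde_minmode(data, x, max_num_mode=2, min_mode_pdf=0.01)
print(maxima)  # expect two modes, near -2 and +2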
Example #18
def pdfcalcs(x_pred, x_hist, y_hist):
    """Calculates the PDFs required to calculate transfer entropy.

    Currently only supports k = 1; l = 1

    """
    # TODO: Generalize for k and l

    # Get dimensions of vectors
#    k = np.size(x_hist[:, 1])
#    l = np.size(y_hist[:, 1])

    # Calculate p(x_{i+h}, x_i, y_i)
    data_1 = np.vstack([x_pred, x_hist[0, :], y_hist[0, :]])
    pdf_1 = stats.gaussian_kde(data_1, 'silverman')

    # Calculate p(x_i, y_i)
    data_2 = np.vstack([x_hist[0, :], y_hist[0, :]])
    pdf_2 = stats.gaussian_kde(data_2, 'silverman')

    # Calculate p(x_{i+h}, x_i)
    data_3 = np.vstack([x_pred, x_hist[0, :]])
    pdf_3 = stats.gaussian_kde(data_3, 'silverman')

    # Calculate p(x_i)
    data_4 = x_hist[0, :]
    pdf_4 = stats.gaussian_kde(data_4, 'silverman')

    return pdf_1, pdf_2, pdf_3, pdf_4
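For reference, transfer entropy is the sample average of log[p1 * p4 / (p2 * p3)] built from these four densities; a sketch of the per-sample integrand (illustrative function name; the averaging over samples is not shown):

import numpy as np

def te_integrand(pdf_1, pdf_2, pdf_3, pdf_4, x_next, x_now, y_now):
    """log[ p(x+,x,y) p(x) / ( p(x,y) p(x+,x) ) ] at one sample point."""
    p1 = pdf_1([x_next, x_now, y_now])
    p2 = pdf_2([x_now, y_now])
    p3 = pdf_3([x_next, x_now])
    p4 = pdf_4([x_now])
    return np.log(p1 * p4 / (p2 * p3))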
Example #19
def traceplot(traces, thin, burn):
    '''
    Plot parameter estimates for different levels of the model
    into the same plots. Black lines are individual observers
    and red lines are mean estimates.
    '''
    variables = ['Slope1', 'Slope2', 'Offset', 'Split']
    for i, var in enumerate(variables):
        plt.subplot(2, 2, i + 1)
        vals = get_values(traces, var, thin, burn)
        dim = (vals.min() - vals.std(), vals.max() + vals.std())
        x = np.linspace(*dim, num=1000)
        for v in vals.T:
            a = gaussian_kde(v)
            y = a.evaluate(x)
            y = y / y.max()
            plt.plot(x, y, 'k', alpha=.5)
        try:
            vals = get_values(traces, 'Mean_' + var, thin, burn)
            a = gaussian_kde(vals)
            y = a.evaluate(x)
            y = y / y.max()
            plt.plot(x, y, 'r', alpha=.75)
        except KeyError:
            pass
        plt.ylim([0, 1.1])
        plt.yticks([0])
        sns.despine(offset=5, trim=True)
        plt.title(var)
Example #20
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination (options: scipy_kde, sklearn_kde, blocks)
    """

    global plx_kde

    if method == 'scipy_kde':

        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            if bandwidth is None:
                plx_kde = gaussian_kde(t['plx'][t['plx']>0.0])
            else:
                plx_kde = gaussian_kde(t['plx'][t['plx']>0.0], bw_method=bandwidth)

    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel':'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)

            if c.kde_subset:
                plx_ran = np.copy(t['plx'][t['plx']>0.0])
                np.random.shuffle(plx_ran)
                plx_kde.fit( plx_ran[0:5000, np.newaxis] )
            else:
                plx_kde.fit( t['plx'][t['plx']>0.0][:, np.newaxis] )

    elif method == 'blocks':
        global plx_bins_blocks
        global plx_hist_blocks

        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        bins = bayesian_blocks(t['plx'][t['plx']>0.0][0:nbins])
        hist, bins = np.histogram(t['plx'][t['plx']>0.0][0:nbins], bins=bins, density=True)

        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")

    else:
        print("You must include a valid method")
        print("Options: kde or blocks")
        return
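A call sketch for the scipy path, assuming the module initialises the plx_kde global to None and imports gaussian_kde; t here is a hypothetical structured array with a 'plx' column in mas:

import numpy as np

t = np.zeros(1000, dtype=[('plx', 'f8')])
t['plx'] = np.abs(np.random.default_rng(4).normal(1.0, 0.5, size=1000))
set_plx_kde(t, bandwidth=0.3, method='scipy_kde')
print(plx_kde.evaluate([0.5, 1.0, 2.0]))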
Example #21
def lookatresults(data, name, modes):
    plots, thetas = [], []
    P = data[-1][0]

    for i in range(len(P)):
        x = P[i]
        theta = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i]-stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84)-modes[i], i+1)



        thetas.append(r'$\theta_{}$'.format(i+1))
        f = plt.figure()
        plt.suptitle(name)
        plt.subplot(111)
        plt.title(theta)
        ker = stats.gaussian_kde(x)
        plt.hist(x, density=True, alpha=0.2)
        X = np.linspace(0.0, max(x) + .1*max(x), 1000)
        plt.plot(X,ker(X))
        plt.xlabel(r"$\theta_{}$".format(i+1))
        #plt.savefig('theta_{}.png'.format(i))
        plots.append(f)


    f = plt.figure()
    plt.subplot(211)
    plt.plot(data['epsilon'], 'o-')
    plt.title(r'$\epsilon$')
    plt.subplot(212)
    plt.plot(data['n total'], 'o-')
    plt.title('N Trials')
    plots.append(f)


    alphas = np.linspace(0, 1, data.size)

    for j in range(len(data[0][0])):
        f = plt.figure()
        for i, D in enumerate(data):
            F = stats.gaussian_kde(D[0][j])
            x = np.linspace(D[0][j].min(), D[0][j].max(), 300)
            plt.plot(x, F(x), alpha=alphas[i])
            plt.xlabel(r"$\theta_{}$".format(j+1))
            if i == data.size - 1:
                plt.plot(x, F(x), c='m', ls='--', lw=2, zorder=1)

        plots.append(f)


    plt.figure()
    f = triangle.corner(P.T, labels=thetas)
    #plt.savefig('trianle.png'.format(i))
    plots.append(f)

    return plots
Example #22
def bootStrap( lofvars, homvars, tlength, targetdir, prefix ) :
    samplings = []
    ind = np.linspace(0,100,512)
    kde = gaussian_kde( homvars[homvars.PSI != "-"].PSI.map(float).tolist() )
    for boot in range(0,1000) :
        kdesub = gaussian_kde( kde.resample(tlength) )
        kdedf = DataFrame( {"subsetname":"Random%d" % boot, 
                          "Density":kdesub.evaluate(ind), "PSI":ind} )
        samplings.append( kdedf )

    samplings = concat( samplings ).reset_index(drop=True)

    quants = samplings.groupby(["PSI"])["Density"].quantile([.025,.5,.975]).reset_index()
    quants.rename(columns={'level_1':"Quantile",0:"Density"}, inplace=True )
    quants["linetype"] = ["Mean" if x == .5 else "95% threshold" for x in quants.Quantile]

    if "vclass" not in lofvars.columns.tolist() : 
        lofvars["vclass"] == "LoF"

    lofvars_sub = lofvars[lofvars.PSI != "-"].copy()
    lofvars_sub.PSI = lofvars_sub.PSI.astype(float)
    lofdf = []
    for vclass,lofclass in lofvars_sub.groupby("vclass") :
        kde = gaussian_kde( lofvars_sub[lofvars_sub.vclass == vclass].PSI.tolist() )
        tmpdf = DataFrame({"vclass":vclass, "Density":kde.evaluate(ind), "PSI":ind})
        lofdf.append( tmpdf )

    lofdf = concat( lofdf ).reset_index(drop=True)

    rsamplings = com.convert_to_r_dataframe(samplings)
    rlofvars = com.convert_to_r_dataframe(lofdf)
    rquants = com.convert_to_r_dataframe(quants)
    rquants = fixRLevels( rquants,"linetype", ["Mean","95% threshold"] )
    #r_pvals = com.convert_to_r_dataframe(pvals)
    p = (ggplot2.ggplot(rlofvars) +
                ggplot2.aes_string(x="PSI",y="Density") + #,group="vclass"
                ggplot2.geom_line( ggplot2.aes_string(x="PSI", y="Density", group="factor(subsetname)"),
                                  color="grey", data=rsamplings ) +
                ggplot2.geom_line( ggplot2.aes_string(x="PSI",y="Density",linetype="factor(linetype)", 
                                                      group="factor(Quantile)"),
                                  color="black", data=rquants ) +
                ggplot2.geom_line( ggplot2.aes_string(color="factor(vclass)") ) +
                #ggplot2.geom_density(ggplot2.aes_string(colour="factor(vclass)"),size=1.5,color="blue") +
                ggplot2.scale_y_continuous("Density") +
                #ggplot2.scale_x_continuous("PSI") +
                ggplot2.scale_linetype("Confidence Interval") +
                ggplot2.scale_colour_brewer("Variant Type",palette="Set1") +
                #ggplot2.theme(**{'legend.position':"none"}) +
                #ggplot2.ggtitle("PSI distribution") +
                #ggplot2.scale_colour_brewer("Variant Type",palette="Set1") +
                #ggplot2.scale_x_discrete("ME AF") +
                ggplot2.theme(**mytheme) )
                #ggplot2.stat_smooth(method="lm", se=False)+
    figname = "%s/%s_psibootstrap2.pdf" % (targetdir,prefix)
    print "Writing file:",figname
    grdevices.pdf(figname, width=5, height=4)
    p.plot()
    grdevices.dev_off()
Example #23
def test_weights_intact():
    # regression test for gh-9709: weights are not modified
    np.random.seed(12345)
    vals = np.random.lognormal(size=100)
    weights = np.random.choice([1.0, 10.0, 100], size=vals.size)
    orig_weights = weights.copy()

    stats.gaussian_kde(np.log10(vals), weights=weights)
    assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)
def violin_plot(bl, dL, ax, clr='blue', alpha=.75, percentiles=[25, 75], corrections=False, **kwargs):
    """Creates a violin plot and sets properties."""
    hl, x, y = hist_axis(bl, dL, **kwargs)
    y2 = np.array([t['med'] for t in hl])
    x2 = np.array([s['sliceMed'] for s in hl])
    ind = ((y > 0) & (x > 0)) | ((y < 0) & (x < 0))
    y = y[ind]
    x = x[ind]
    lim = max(np.max(np.abs(x2)), np.max(np.abs(y2)))*1.1
    dataset = [s['data'] for s in hl]
    p_data = [np.percentile(s, percentiles) for s in dataset]
    refined_dataset = [x[((x > p[0]) & (x < p[1]))] for x, p in zip(dataset, p_data)]
    violin_widths = .6*(max(x2) - min(x2))/len(y2)

    violins = ax.violinplot(refined_dataset, widths=violin_widths, showmedians=True, showextrema=False, positions=x2)

    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set(adjustable='box', aspect='equal')
    add_identity(ax, color='.3', ls='-', linewidth=2, zorder=1)

    for vio in violins['bodies']:
        vio.set(facecolor=clr, alpha=alpha)

    violins['cmedians'].set(edgecolor='red')
    violins['cmedians'].set_linewidth(2.5)

    if corrections:
        total_kernel = gaussian_kde(y)
        xspace = np.linspace(np.nanmin(x), np.nanmax(x), 1000)
        total_kernel_array = total_kernel.evaluate(xspace)
        gaussian_means = []
        gaussian_stds = []
        for bin in hl:
            bin_kernel = gaussian_kde(bin['data'])
            bin_kernel_array = bin_kernel.evaluate(xspace)
            divided_dist = bin_kernel_array/np.sqrt(total_kernel_array)
            max_ind = np.argmax(bin_kernel_array)
            ind_valid = (divided_dist < 1e10)
            # ind_valid = (divided_dist < divided_dist[max_ind])
            try:
                popt = scipy.optimize.least_squares(gaussian_func,
                                                    [np.abs(bin['sliceMed']), np.abs(bin['sliceMed'] / 5),
                                                     bin['sliceMed']],
                                                    args=(xspace[ind_valid], divided_dist[ind_valid]),
                                                    jac='3-point', x_scale='jac', loss='soft_l1', f_scale=.1).x
                # popt, pcov = curve_fit(gaussian, xspace[ind_valid], divided_dist[ind_valid],
                #     p0=[np.abs(bin['sliceMed']), np.abs(bin['sliceMed']/5), bin['sliceMed']], maxfev=10000)
            except Exception:
                popt = np.array([np.nan, np.nan, np.nan])
            gaussian_means.append(popt[2])
            gaussian_stds.append(popt[1])
        ax.plot(x2, gaussian_means, color='black', linestyle='None', marker='.', ms=15, alpha=.6, zorder=10)
        for std, mean, x_pos in zip(gaussian_stds, gaussian_means, x2):
            ax.plot([x_pos, x_pos], [mean + std, mean - std], 'k-', alpha=.6, zorder=10)

    return hl
def plot_accuracy(counts, labels, ntrain, cpu):
    oaa = sum(counts) / len(counts)
    cotr = counts[ntrain > 0]
    acc = sum(cotr) / len(cotr)
    cput = sum(cpu) / 60

    print('\nOverall accuracy: %.2f' % oaa)
    print('Accuracy given at least 1 training sample: %.2f' % acc)
    print('CPU time: %0.2f min' % cput)

    d = {}
    for lbl, cor, ntr, in zip(labels, counts, ntrain):
        if lbl in d:
            d[lbl][0].append(cor)
            d[lbl][1].append(ntr)
        else:
            d[lbl] = ([cor], [ntr])

    print('')

    cte = np.array([sum(x[0]) for x in d.values()])
    num = np.array([len(x[0]) for x in d.values()])
    y = cte / num
    x = np.array([x[1][0] for x in d.values()])
    x2 = np.array([len(x) for x in d])

    fig = plt.figure()
    ax = fig.add_subplot(121)
    # ax.plot(x, y, '.')
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    sc = ax.scatter(x, y, c=z, s=100, edgecolor='none')
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlim([-5, max(x) + 5])
    plt.grid()
    plt.xlabel('# training samples (for given label)')
    plt.ylabel('accuracy')
    cbar = plt.colorbar(sc)
    cbar.ax.set_ylabel('label density')
    # cbar.set_ticks([0, 0.25, 0.5, 0.75, 1])
    # cbar.set_ticklabels(['0', '0.25', '0.5', '0.75', '1'], update_ticks=True)

    ax2 = fig.add_subplot(122)
    xy2 = np.vstack([x2, y])
    z2 = gaussian_kde(xy2)(xy2)
    sc2 = ax2.scatter(x2, y, c=z2, s=100, edgecolor='none')
    ax2.set_ylim([-0.05, 1.05])
    ax2.set_xlim([-5, max(x2) + 5])
    plt.grid()
    plt.xlabel('word length')
    plt.ylabel('accuracy')
    cbar = plt.colorbar(sc2)
    cbar.ax.set_ylabel('label density')

    plt.show()
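The gaussian_kde(xy)(xy) pattern above (also used in Examples #30 and #33) is the standard trick for colouring a scatter plot by local point density; a minimal standalone sketch, sorting so the densest points are drawn last:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

rng = np.random.default_rng(5)
x = rng.normal(size=1000)
y = x + rng.normal(scale=0.5, size=1000)
z = gaussian_kde(np.vstack([x, y]))(np.vstack([x, y]))  # density at each point
order = z.argsort()  # densest points plotted on top
plt.scatter(x[order], y[order], c=z[order], s=20)
plt.colorbar(label='point density')
plt.show()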
Example #26
    def fit(self, X, y):

        def jitter(x, range):
            y = np.copy(x)
            scale_exp_min = np.abs(np.ceil(np.log10(range[0])))
            scale_exp_max = np.abs(np.ceil(np.log10(range[1])))
            scale_exp = (scale_exp_max + scale_exp_min) / 2.
            r = np.random.rand(y.size) / (10**scale_exp)
            y = y + r
            return y

        # Print msg. when going into gcp.fit
        strMessage = "rows in X = %d, r_minimum = %d" % (X.shape[0], self.r_minimum)
        logger.debug(strMessage)

        # Use X and y to train a Gaussian Copula Process.
        super(GCP, self).fit(X, y)

        # skip training the process if there aren't enough samples
        if X.shape[0] < self.r_minimum:
            return

        # -- Non-parametric model of 'y', estimated with kernel density
        kernel_pdf = st.gaussian_kde(y)
        kernel_cdf = make_cdf(kernel_pdf)
        kernel_ppf = make_ppf(kernel_pdf)
        y_kernel_model = {'pdf': kernel_pdf, 'cdf': kernel_cdf, 'ppf': kernel_ppf}
        self.y_kernel_model = y_kernel_model

        # - Transform y-->F-->vF-->norm.ppf-->v
        vF = y_kernel_model['cdf'](y)
        v = st.norm.ppf(vF)

        # -- Non-parametric model of each feature in 'X', estimated with kernel density
        X_kernel_model = []
        for ki in range(X.shape[1]):
            columnX = X[:, ki]
            if self.tunables[ki][1].is_integer:
                columnX = jitter(columnX, self.tunables[ki][1].range)
            kernel_pdf = st.gaussian_kde(columnX)
            kernel_cdf = make_cdf(kernel_pdf)
            kernel_ppf = make_ppf(kernel_pdf)
            kernel_model = {'pdf': kernel_pdf, 'cdf': kernel_cdf, 'ppf': kernel_ppf}
            X_kernel_model.append(kernel_model)
        self.X_kernel_model = X_kernel_model

        # -- Transform X-->F-->uF-->norm.ppf-->U
        U = np.empty_like(X)
        for ki in range(X.shape[1]):
            uF = X_kernel_model[ki]['cdf'](X[:, ki])
            U[:, ki] = st.norm.ppf(uF)

        # - Instantiate a GP and fit it with (U, v)
        self.gcp = GaussianProcessRegressor(normalize_y=True)
        self.gcp.fit(U, v)
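make_cdf and make_ppf are not shown in this example; one plausible construction (an assumption, not the original helpers) uses gaussian_kde's integrate_box_1d for the CDF and a bracketing root find for its inverse:

import numpy as np
from scipy import optimize

def make_cdf(kernel_pdf):
    """Vectorized CDF of a 1-D gaussian_kde via integrate_box_1d."""
    return np.vectorize(lambda v: kernel_pdf.integrate_box_1d(-np.inf, v))

def make_ppf(kernel_pdf, lo=-1e6, hi=1e6):
    """Quantile function: invert the CDF with brentq (valid for 0 < q < 1)."""
    def ppf_scalar(q):
        return optimize.brentq(
            lambda v: kernel_pdf.integrate_box_1d(-np.inf, v) - q, lo, hi)
    return np.vectorize(ppf_scalar)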
Example #27
def plot_rmse(working_directory):

    figure_dir = os.path.join(working_directory, 'Figures')
    if not os.path.exists(figure_dir):
        os.makedirs(figure_dir, exist_ok=True)

    x_grid = np.arange(0, 360, 10)
    correct = star.get_EAs_from_star(os.path.join(
        working_directory, 'exp_projections.star'))

    plt.figure(0)
    first = star.get_EAs_from_star(os.path.join(
        working_directory, 'it000', 'orientations.star'))
    correct_rmse = calc_rmse(correct, first)
    # plt.hist(correct_rmse)]
    correct_kde = gaussian_kde(correct_rmse)
    plt.plot(x_grid, correct_kde.evaluate(x_grid))
    # plt.ylim([0, 1])
    plt.xlabel('RMSE for 3 Euler angles')
    plt.ylabel('Density')
    plt.title('Compare with correct angle distribution')
    plt.savefig(os.path.join(figure_dir, 'it000'), dpi=150)

    exp_folder = glob.glob(os.path.join(working_directory, 'it*'))
    last = star.get_EAs_from_star(os.path.join(exp_folder[0], 'orientations.star'))
    exp_folder.pop(0)

    for i, folder in enumerate(exp_folder, start=1):
        now = star.get_EAs_from_star(os.path.join(
            folder, 'orientations.star'))
        correct_rmse = calc_rmse(correct, now)
        last_rmse = calc_rmse(last, now)
        last = now

        fig = plt.figure(num=i, figsize=(16, 6))
        gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
        plt.suptitle('Iteration: {0}'.format(i))
        plt.subplot(gs[0])
        # plt.hist(correct_rmse)
        correct_kde = gaussian_kde(correct_rmse)
        plt.plot(x_grid, correct_kde.evaluate(x_grid))
        plt.xlabel('RMSE for 3 Euler angles')
        plt.ylabel('Density')
        plt.title('Compare with correct angle distribution')
        plt.subplot(gs[1])
        # plt.hist(last_rmse)
        last_kde = gaussian_kde(last_rmse)
        plt.plot(x_grid, last_kde.evaluate(x_grid))
        plt.xlabel('RMSE for 3 Euler angles')
        plt.ylabel('Density')
        plt.title('Compare with angle distribution of last iteration')
        plt.savefig(os.path.join(figure_dir, 'it' + str(i).zfill(3)),
                    dpi=150, bbox_inches='tight')
        plt.close(fig)
Example #28
def test_weights_integer():
    # integer weights are OK, cf gh-9709 (comment)
    np.random.seed(12345)
    values = [0.2, 13.5, 21.0, 75.0, 99.0]
    weights = [1, 2, 4, 8, 16]  # a list of integers
    pdf_i = stats.gaussian_kde(values, weights=weights)
    pdf_f = stats.gaussian_kde(values, weights=np.float64(weights))

    xn = [0.3, 11, 88]
    assert_allclose(pdf_i.evaluate(xn),
                    pdf_f.evaluate(xn), atol=1e-14, rtol=1e-14)
def plotAffVersusUnaff(intronret, affset, unaffset, figuredir, gfftype, dtype):
    print "Running plotAffVersusUnaff"
    inretmat = intronret.loc[:, affset+unaffset]

    maxx = intronret["logRII"].max()
    AvUsamplings = []  # List of density distributions for each comparison
    ind = np.linspace(intronret["logRII"].min(), maxx, 512)

    for aff in affset:
        for unaff in unaffset:
            print "A:", aff, "versus", "U:", unaff
            newriivals = inretmat.apply(lambda row:
                                        calcRII(row, [aff], [unaff]), axis=1)
            kdesub = gaussian_kde(newriivals)
            kdedf = pd.DataFrame({"subsetname": aff+" vs "+unaff,
                                  "Affected": aff, "Unaffected": unaff,
                                  "Density": kdesub.evaluate(ind),
                                  "logRII": ind})
            AvUsamplings.append(kdedf)

    AvUsamplings = pd.concat(AvUsamplings).reset_index(drop=True)

    # Calculate an Observed density using the original data
    newriivals = inretmat.apply(lambda row:
                                calcRII(row, affset, unaffset), axis=1)
    kde = gaussian_kde(newriivals)
    obsdf = pd.DataFrame({"vclass": "Observed", "Density": kde.evaluate(ind),
                          "logRII": ind})

    rsamplings = com.convert_to_r_dataframe(AvUsamplings)
    # robsdf = com.convert_to_r_dataframe(obsdf)
    p = (ggplot2.ggplot(rsamplings) +
         ggplot2.aes_string(x="logRII", y="Density",
                            group="factor(subsetname)") +
         ggplot2.geom_vline(xintercept=0, linetype="dashed") +
         ggplot2.geom_hline(yintercept=0, linetype="solid") +
         ggplot2.geom_line(ggplot2.aes_string(color="factor(Unaffected)")) +
         ggplot2.scale_y_continuous("Density") +
         ggplot2.scale_x_continuous("Log RII") +
         ggplot2.scale_colour_brewer("Unaffected", palette="Set1") +
         ggplot2.facet_wrap(robjects.Formula('~ Affected'), ncol=3) +
         ggplot2.theme(**sitefreqtheme) +
         ggplot2.theme(**{'legend.position': "right"}))
    # ggplot2.geom_line( ggplot2.aes_string(x="logRII",y="Density",
    # group="factor(vclass)") +
    # linetype="dashed", color="black", data=robsdf ) +
    # ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle = 45)}) +

    figname = os.path.join(figuredir, gfftype+"_"+dtype+"_logirr_AvU.pdf")
    print "Writing file:", figname
    grdevices.pdf(figname, width=10, height=8)
    p.plot()
    grdevices.dev_off()
Example #30
def si_lam(overlaps):

	X_si_wEM 	= [(a.lam, a.fp) for a,b in overlaps] + [(b.lam, b.fp) for a,b in overlaps]
	X_si_lam 	= [(a.si, a.lam) for a,b in overlaps] + [(b.si, b.lam) for a,b in overlaps]
	X_lam_wEM 	= [(a.lam, a.wEM) for a,b in overlaps] + [(b.lam, b.wEM) for a,b in overlaps]
	X_pi_wEM 	= [(a.si, a.fp) for a,b in overlaps] + [(b.si, b.fp) for a,b in overlaps]
	

	F 			= plt.figure(figsize=(15,10))
	
	ax1 		= F.add_subplot(2,2,1)
	# log-scale variant, immediately overridden by the next line:
	# x,y = [math.log(x,10) for x,y in X_si_wEM],[y for x,y in X_si_wEM]
	x,y 		= [x for x,y in X_si_wEM],[y for x,y in X_si_wEM]
	xy 			= np.vstack([x,y])
	z 			= gaussian_kde(xy)(xy)
	ax1.scatter(x, y, c=z, s=14, edgecolor='none')
	ax1.set_xlabel("Variance in Loading")
	ax1.set_ylabel("Probability of Paused")
	
	ax1.grid()

	
	ax2 		= F.add_subplot(2,2,2)
	# log-log variant, immediately overridden by the next line:
	# x,y = [math.log(x,10) for x,y in X_si_lam],[math.log(y,10) for x,y in X_si_lam]
	x,y 		= [x for x,y in X_si_lam],[y for x,y in X_si_lam]
	xy 			= np.vstack([x,y])
	z 			= gaussian_kde(xy)(xy)
	ax2.set_xlabel("Variance in Loading")
	ax2.set_ylabel("Length of Initiation")
	ax2.scatter(x, y, c=z, s=14, edgecolor='none')
	ax2.grid()

	ax3 		= F.add_subplot(2,2,3)
	x,y 		= [math.log(x,10) for x,y in X_lam_wEM],[y for x,y in X_lam_wEM]
	xy 			= np.vstack([x,y])
	z 			= gaussian_kde(xy)(xy)
	ax3.set_xlabel("Length of Initiation")
	ax3.set_ylabel("Probability of Paused")
	ax3.scatter(x, y, c=z, s=14, edgecolor='none')
	ax3.grid()

	ax4 		= F.add_subplot(2,2,4)
	x,y 		= [x for x,y in X_pi_wEM],[y for x,y in X_pi_wEM]
	xy 			= np.vstack([x,y])
	z 			= gaussian_kde(xy)(xy)
	
	ax4.set_xlabel("Strand Probability")
	ax4.set_ylabel("Probability of Paused")
	ax4.scatter(x, y, c=z, s=14, edgecolor='none')
	ax4.grid()

	plt.show()
Example #31
def plot_kmap(data,
              data_raw=True,
              as_partitions=None,
              data_label="",
              filename="",
              plot_annotation=True,
              annotation_params=None,
              title=None,
              title_loc="center",
              titlelabelsize=26,
              axlabelsize=22,
              textsize=16,
              annotationsize=13,
              tail_threshold=None,
              plot_legend=False,
              plot_scatter=True,
              scatter_ms=None,
              scatter_c='k',
              scatter_a=.5,
              scatter_m=r'.',
              plot_heatmap=True,
              colormap=plt.cm.Greys,
              plot_contour=False,
              plot_contour_lbls=False,
              max_val_exp=5):
    # Plot basic setup
    matplotlib.rcParams.update({'font.size': textsize})
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    if title is not None:
        plt.title(title, loc=title_loc, fontdict={'fontsize': titlelabelsize})
    ax.set_xlabel("Anonymity Set Size ($k$)")
    ax.set_ylabel("Num. Anonymity Sets at Size of $k$")
    plt.tick_params(axis='both', which='major', labelsize=axlabelsize)

    # Process data
    if data_raw:
        # Assumed that anonymity sets partition the dataset
        data_length = len(data)
        xy = Counter(Counter(data).values())
        x = [x_ for x_ in sorted(xy.keys())]
        y = [xy[ass] for ass in sorted(xy.keys())]
        if as_partitions is None or as_partitions is True:
            z = [ass * xy[ass] for ass in sorted(xy.keys())]
            w = [
                float(ass * xy[ass]) / data_length for ass in sorted(xy.keys())
            ]
        else:
            z = [xy[ass] for ass in sorted(xy.keys())]
            w = [float(xy[ass]) / data_length for ass in sorted(xy.keys())]
    else:
        # Not assumed that anonymity sets partition the dataset (e.g., they could be overlapping)
        data_length = data[0]
        xy = data[1]
        x = [x_ for x_ in sorted(xy.keys())]
        y = [xy[ass] for ass in sorted(xy.keys())]
        if as_partitions is None or as_partitions is False:
            z = [xy[ass] for ass in sorted(xy.keys())]
            w = [float(xy[ass]) / data_length for ass in sorted(xy.keys())]
        else:
            z = [ass * xy[ass] for ass in sorted(xy.keys())]
            w = [
                float(ass * xy[ass]) / data_length for ass in sorted(xy.keys())
            ]

    if plot_heatmap or plot_contour:
        # Emphasize heavy spots for the contour (but visualize only one for each)
        x_ = []
        y_ = []
        z_ = []
        for ix in range(len(z)):
            for i in range((z[ix])):
                if i == 0:
                    if scatter_ms is None:
                        z_.append(float(10000 * z[ix]) / data_length)
                    else:
                        z_.append(scatter_ms)
                else:
                    z_.append(0.0)

                x_.append(math.log(x[ix], 10))
                y_.append(math.log(y[ix], 10))

        # Heatmap calculation
        X, Y = np.mgrid[-0.5:5:100j, -0.5:5:100j]
        positions = np.vstack([X.ravel(), Y.ravel()])
        values = np.vstack([x_, y_])
        kernel = gaussian_kde(values)
        Z = np.reshape(kernel(positions).T, X.shape)
        Z = np.sqrt(np.sqrt(Z))  # Strengthen low-weighted regions

        # Plot heatmap
        if plot_heatmap:
            plt.contourf(X, Y, Z, 10, cmap=colormap, alpha=.5)

        # Plot contour
        if plot_contour:
            cs = plt.contour(X, Y, Z, 10, cmap=colormap, alpha=.5)
            if plot_contour_lbls:
                plt.clabel(cs, inline=1, fontsize=int(textsize / 2))

    if plot_scatter:
        # Scatter points
        plt.scatter([math.log(_, 10) for _ in x], [math.log(_, 10) for _ in y],
                    s=[10**4 * _ for _ in w],
                    alpha=scatter_a,
                    c=scatter_c,
                    marker=scatter_m,
                    label=data_label)  # alpha=.5,

        if plot_legend:
            # Legend
            lgnd = plt.legend(loc="upper right",
                              fontsize=textsize)  # , numpoints=1
            lgnd.legendHandles[0]._sizes = [30]

    # Select groups of datapoints
    if isinstance(plot_annotation, list):
        grps = [[] for _ in range(len(plot_annotation))]
        weights = [0.0 for _ in range(len(plot_annotation))]

        for ix in range(len(x)):
            for gix in range(len(plot_annotation)):
                grp = plot_annotation[gix]
                if x[ix] >= min(grp) and x[ix] <= max(grp):
                    grps[gix].append([x[ix], y[ix]])
                    weights[gix] += w[ix]

        for gix, grp in enumerate(grps):
            if len(grp) == 0:
                continue

            annotation_radius = .1
            if isinstance(annotation_params,
                          dict) and 'radius' in annotation_params:
                if isinstance(annotation_params['radius'], list):
                    annotation_radius = annotation_params['radius'][gix]
                else:
                    annotation_radius = annotation_params['radius']
            annotation_distance = 1.0
            if isinstance(annotation_params,
                          dict) and 'distance' in annotation_params:
                if isinstance(annotation_params['distance'], list):
                    annotation_distance = annotation_params['distance'][gix]
                else:
                    annotation_distance = annotation_params['distance']
            annotation_linestyle = dict(color='r', width=2, style='-')
            if isinstance(annotation_params,
                          dict) and 'linestyle' in annotation_params:
                annotation_linestyle = annotation_params['linestyle']

            pts = [[math.log(pt[0], 10), math.log(pt[1], 10)] for pt in grp]
            c, r = selectpoints(ax,
                                pts,
                                radius=annotation_radius,
                                ec=annotation_linestyle['color'],
                                lw=annotation_linestyle['width'],
                                ls=annotation_linestyle['style'],
                                fill=False)

            annotation_shift_vector = [r * annotation_distance, 0.0]
            if isinstance(annotation_params,
                          dict) and 'location' in annotation_params:
                if isinstance(annotation_params['location'], list):
                    if annotation_params['location'][gix] == 'left':
                        annotation_shift_vector = [
                            -r * annotation_distance, 0.0
                        ]
                    elif annotation_params['location'][gix] == 'top':
                        annotation_shift_vector = [
                            0.0, r * annotation_distance
                        ]
                    elif annotation_params['location'][gix] == 'bottom':
                        annotation_shift_vector = [
                            0.0, -r * annotation_distance
                        ]
                else:
                    if annotation_params['location'] == 'left':
                        annotation_shift_vector = [
                            -r * annotation_distance, 0.0
                        ]
                    elif annotation_params['location'] == 'top':
                        annotation_shift_vector = [
                            0.0, r * annotation_distance
                        ]
                    elif annotation_params['location'] == 'bottom':
                        annotation_shift_vector = [
                            0.0, -r * annotation_distance
                        ]

            plt.text(c[0] + annotation_shift_vector[0],
                     c[1] + annotation_shift_vector[1],
                     "%.2f %%" % (weights[gix] * 100))

    # Add annotations for minimum and maximum anonymity sets
    elif isinstance(plot_annotation, bool) and plot_annotation:
        add_annotations(ax, x, y, z, annotationsize, tail_threshold)

    # Setup XY axes
    maxval = max_val_exp
    plt.ylim(-0.5, maxval)
    plt.xlim(-0.5, maxval)
    ticks = range(maxval + 1)
    lbls = ["${10}^{%d}$" % v for v in range(maxval + 1)]
    ax.set_xticks(ticks)
    ax.set_xticklabels(lbls)
    ax.set_yticks(ticks)
    ax.set_yticklabels(lbls)

    # Save file
    plt.tight_layout()
    if '.' in filename:
        plt.savefig(filename)
    else:
        plt.savefig(filename + '.pdf')
        plt.savefig(filename + '.png')
Example #32
def bland_altman_plots(df, rep_stats=None, els=['Mg', 'Sr', 'Ba', 'Al', 'Mn']):
    # get corresponding analyte and ratio names
    As = []
    Rs = []
    analytes = [c for c in df.columns if ('_r' not in c) and ('_t' not in c)]
    ratios = [c for c in df.columns if ('_r' in c)]

    for e in els:
        if e == 'Sr':
            As.append('Sr88')
        elif e == 'Mg':
            As.append('Mg24')
        else:
            As.append([a for a in analytes if e in a][0])
        Rs.append([r for r in ratios if e in r][0][:-2])
    
    fig, axs = plt.subplots(len(els), 3, figsize=(6.5, len(els) * 2))
    
    for i, (e, a) in enumerate(zip(Rs, As)):
        if a == 'Ba138':
            m = 1e3
            u = '$\mu$mol/mol'
        else:
            m = 1
            u = 'mmol/mol'
        
        tax, lax, hax = axs[i]
        c=element_colour(a)

        x = df.loc[:, e + '_r'].values * m
        yt = df.loc[:, e + '_t'].values * m
        yl = df.loc[:, a].values * m
        
        # draw Bland-Altman plots
        if rep_stats is None:
            CI = None
        else:
            CI = rep_stats[e][0]
        bland_altman(x, yt, interval=.75, indep_conf=CI, ax=tax, c=c)
        bland_altman(x, yl, interval=.75, indep_conf=CI, ax=lax, c=c)
        
        xlim = (min(tax.get_xlim()[0], lax.get_xlim()[0]), max(tax.get_xlim()[1], lax.get_xlim()[1]))
        tax.set_xlim(xlim)
        lax.set_xlim(xlim)

        ylim = rangecalc(tax.get_ylim(), lax.get_ylim())

        # draw residual PDFs
        # calculate residuals
        rt = yt - x
        rl = yl - x
        # remove NaNs
        rt = rt[~np.isnan(rt)]
        rl = rl[~np.isnan(rl)]
        # calculate bins
        bins = np.linspace(*ylim, 100)
        # calculate KDEs
        kdt = stats.gaussian_kde(rt, .4)
        kdl = stats.gaussian_kde(rl, .4)
        # draw KDEs
        hax.fill_betweenx(bins, kdl(bins), facecolor=element_colour(a), alpha=0.8, edgecolor='k', lw=0.75, label='LAtools', zorder=-1)
        hax.fill_betweenx(bins, kdt(bins), facecolor=element_colour(a), alpha=0.4, edgecolor='k', lw=0.75, label='Manual', zorder=1)
        # limits and horizontal line
        hax.set_xlim([0, hax.get_xlim()[-1]])
        hax.axhline(0, ls='dashed', c='k', alpha=0.6, zorder=-1)

        for ax in axs[i]:
            ax.set_ylim(ylim)
                        
            if ax.is_first_col():
                ax.set_ylabel(e + ' ('+ u + ')\nResidual')
            else:
                ax.set_ylabel('')
                ax.set_yticklabels([])
            
            if ax.is_last_row():
                tax.set_xlabel('Mean')
                lax.set_xlabel('Mean')
                hax.set_xlabel('Residual Density')
                hax.legend()
            else:
                ax.set_xlabel('')

            if ax.is_first_row():
                tax.set_title('Manual Test User', loc='left')
                lax.set_title('LAtools Test User', loc='left')
                hax.set_title('Residuals', loc='left')

    fig.tight_layout()

    return fig, axs
Example #33
    markers = ['v', '^', 'd', '_', '|', 's', '8', 's', 'p', '*']

    filname = 'flow_samples_all.txt'
    with open(filname, 'rb') as f:
        data = pickle.load(f)
    x = []
    y = []
    # print(len(data))
    for j in range(0, len(data[1:50000])):
        x.append(data[j][0][0])
        y.append(data[j][0][1])
        # print(j)
    # print(data)
    # len(y)
    xy = np.vstack([x, y])
    z = (gaussian_kde(xy)(xy))
    # x_m = sum(x) / float(len(x))
    # y_m = sum(y) / float(len(x))
    ax.scatter(x, y, c=z, s=10, edgecolor='none')

    print("main_done")
    for i in range(0, 10):
        filname = 'flow_samples_' + str(i) + '.txt'
        with open(filname, 'rb') as f:
            data = pickle.load(f)
        x = []
        y = []
        # print(len(data))
        for j in range(0, len(data[1:5000])):
            x.append(data[j][0][0])
            y.append(data[j][0][1])
Example #34
    def __init__(self, param_list, values, bw_method=None):
        self.param_list = param_list
        self.pdf_estimate = gaussian_kde(values, bw_method=bw_method)
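This fragment is the constructor only; a self-contained wrapper in the same spirit might look as follows (the class name and __call__ method are assumptions, not part of the original):

from scipy.stats import gaussian_kde

class PdfEstimate:
    """Hypothetical standalone version of the snippet above."""
    def __init__(self, param_list, values, bw_method=None):
        self.param_list = param_list
        self.pdf_estimate = gaussian_kde(values, bw_method=bw_method)

    def __call__(self, points):
        return self.pdf_estimate.evaluate(points)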
Example #35
def comparison_plots(df, els=['Mg', 'Sr', 'Ba', 'Al', 'Mn']):
    """
    Function for plotting Test User and LAtools data comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe containing reference ('X/Ca_r'), test user 
        ('X/Ca_t') and LAtools ('X123') data.
    els : list
        list of elements (names only) to plot.
    """
    
    # get corresponding analyte and ratio names
    As = []
    Rs = []
    analytes = [c for c in df.columns if ('_r' not in c) and ('_t' not in c)]
    ratios = [c for c in df.columns if ('_r' in c)]

    for e in els:
        if e == 'Sr':
            As.append('Sr88')
        elif e == 'Mg':
            As.append('Mg24')
        else:
            As.append([a for a in analytes if e in a][0])
        Rs.append([r for r in ratios if e in r][0][:-2])
    
    fig, axs = plt.subplots(len(els), 3, figsize=(6.5, len(els) * 2))
    
    for i, (e, a) in enumerate(zip(Rs, As)):
        if a == 'Ba138':
            m = 1e3
            u = r'$\mu$mol/mol'  # raw string avoids the invalid \m escape
        else:
            m = 1
            u = 'mmol/mol'
        
        c = element_colour(a)
        
        tax, lax, hax = axs[i]
        
        x = df.loc[:, e + '_r'].values * m
        yt = df.loc[:, e + '_t'].values * m
        yl = df.loc[:, a].values * m
        
        # calculate residuals
        rt = yt - x
        rl = yl - x
        
        # plot residuals
        tax.scatter(x, yt, c=c, s=15, lw=0.5, edgecolor='k', alpha=0.5)
        lax.scatter(x, yl, c=c, s=15, lw=0.5, edgecolor='k', alpha=0.5)
        
        # plot PDFs
        rt = rt[~np.isnan(rt)]
        rl = rl[~np.isnan(rl)]
        lims = np.percentile(np.hstack([rt, rl]), [99, 1])
        lims += np.ptp(lims) * np.array((-1.25, 1.25))
        bins = np.linspace(*lims, 100)
        kdt = stats.gaussian_kde(rt, .4)
        kdl = stats.gaussian_kde(rl, .4)
        hax.fill_between(bins, kdl(bins), facecolor=c, alpha=0.7, edgecolor='k', lw=0.5, label='LAtools')
        hax.fill_between(bins, kdt(bins), facecolor=c, alpha=0.4, edgecolor='k', lw=0.5, label='Test User')
        hax.set_ylim([0, hax.get_ylim()[-1]])
        hax.set_xlim(lims)
        hax.axvline(0, c='k', ls='dashed', alpha=0.6)
        # hax.set_yticklabels([])
        hax.set_ylabel('Density')
        
        # axis labels, annotations and limits
        tax.set_ylabel(e + ' ('+ u + ')')
        tax.text(.02, .98, fmt_RSS(rt), fontsize=8,
                 ha='left', va='top', transform=tax.transAxes)
        lax.text(.02, .98, fmt_RSS(rl), fontsize=8,
                 ha='left', va='top', transform=lax.transAxes)

        xlim = np.percentile(x[~np.isnan(x)], [0, 98])
        for ax in [tax, lax]:
            ax.set_xlim(xlim)
            ax.set_ylim(xlim)
            
            ax.plot(xlim, xlim, c='k', ls='dashed', alpha=0.6)
        
        for ax in axs[i]:
            if ax.is_last_row():
                hax.set_xlabel('Residual')
                tax.set_xlabel('Reference User')
                lax.set_xlabel('Reference User')
                hax.legend(fontsize=8)

            if ax.is_first_row():
                tax.set_title('Manual Test User', loc='left')
                lax.set_title('LAtools Test User', loc='left')
            
    fig.tight_layout()
    return fig, axs
Beispiel #36
0
    StopCond = 0
    Stop = 20
    NormalIDX = 20

    TargetData = TrainECG[list(TrainECG.keys())[NormalIDX]]  # list() keeps this working for plain dicts in Python 3

    for idx, key in enumerate(TestLabel):
        if TestLabel[key] == "V":
            TargetData2 = TestECG[key]
            StopCond += 1

        if StopCond == Stop:
            break

    Density_V = gaussian_kde(TargetData2)
    Domain_V = np.linspace(-max(TargetData2), max(TargetData2), 1000)
    Density = gaussian_kde(TargetData)
    Domain = np.linspace(-max(TargetData), max(TargetData), 1000)

    plt.figure()
    plt.title("V")
    plt.plot(Domain_V, Density_V(Domain_V))
    plt.grid()

    plt.figure()
    plt.title("N")
    plt.plot(Domain, Density(Domain))
    plt.grid()

    plt.show()
Beispiel #37
0
print(avg_v)
print(avg_celldist * avg_v)

fig = plt.figure(figsize=(18, 6))
fig.suptitle("Material Project Dataset")

ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=4)
w = 0.2
xint = np.arange(7)
ax1.bar(xint - w, avg_lv[:, 0], width=w, color='b', align='center')
ax1.bar(xint, avg_lv[:, 1], width=w, color='r', align='center')
ax1.bar(xint + w, avg_lv[:, 2], width=w, color='g', align='center')
ax1.set_xticks(xint)
ax1.set_xticklabels(
    ('Tri-', 'Mono-', "Ortho-", "Tetra-", "Trig-", "Hexa-", "Cubic"))
plt.title("Average min/median/max lattice vector")

ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=4)
xmax = rmax_list.max()
for i in range(7):
    indices = np.where(type_list == i)
    ds = rmax_list[indices]
    density = gaussian_kde(ds)
    xs = np.linspace(0, xmax, 200)
    density.covariance_factor = lambda: .25  # old-style fixed-bandwidth override
    density._compute_covariance()  # touches private API; see the bw_method sketch below
    plt.plot(xs, density(xs))
plt.legend(('Tri-', 'Mono-', "Ortho-", "Tetra-", "Trig-", "Hexa-", "Cubic"))
plt.title("Gaussian density of rmax distribution")

plt.show()
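# The covariance_factor/_compute_covariance pattern above reaches into
# gaussian_kde's private API. A sketch of the supported equivalent in
# current SciPy: pass the fixed factor as bw_method, or call set_bandwidth
# on an existing estimator.
import numpy as np
from scipy.stats import gaussian_kde

ds = np.random.randn(500)
density = gaussian_kde(ds, bw_method=0.25)   # same fixed factor of .25
density.set_bandwidth(bw_method=0.25)        # equivalent post-hoc adjustment
xs = np.linspace(ds.min(), ds.max(), 200)
ys = density(xs)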
Beispiel #38
0
    tempData = yrData.loc[yrData.Month == m]
    tempData = tempData.loc[tempData.Day == d]

    col = 'yr' + str(yr)
    print(col)
    dfTempDay[col] = tempData.Temperature.values
    dfGHIDay[col] = tempData.GHI.values

#%%

from scipy import stats as st
import numpy as np

sampleKDE_F = dfTempDay.iloc[0]
my_kde = st.gaussian_kde(sampleKDE_F)
sampleKDE_F = my_kde.resample(1)[0][0]
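#%%
# gaussian_kde.resample returns an array of shape (# dims, # samples), which
# is why the single draw above needs the [0][0] indexing. A short
# illustrative sketch drawing many samples at once from the same KDE:
many_draws = my_kde.resample(1000)[0]   # 1000 draws from the fitted density
print(many_draws.mean(), many_draws.std())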

#%%
#
##%% Parse Date
#
#tempDayData = pd.DataFrame(0, index=np.arange(24), columns=col)
#
##yr = 2007;
#month = 7;
#day = 1;
#i = 0;
#
#for key, yrData in data.items():
#    j = 'yr'+str(key)
Beispiel #39
0
        spread.append(p[t] / r[t] * 100)
        leverage.append(a[t] / (p[t] * np.average(n_equity)))
    Spread.append(spread)
    Leverage.append(leverage)
##############
x = []
for i in range(sim):  # loop order matches the y_spread/y_leverage construction below
    for j in range(iterations - cut_time - 1):
        x.append(j)
y_spread = []
y_leverage = []
for i in range(sim):
    for j in range(iterations - cut_time - 1):
        y_spread.append(Spread[i][j])
        y_leverage.append(Leverage[i][j])
z_spread = gaussian_kde(y_spread)(y_spread)
z_leverage = gaussian_kde(y_leverage)(y_leverage)
idx_spread = z_spread.argsort()
idx_leverage = z_leverage.argsort()
x_spread = copy.deepcopy(x)
x_leverage = copy.deepcopy(x)
# x_spread, y_spread, z_spread = x_spread[idx_spread], y_spread[idx_spread], z_spread[idx_spread]
# x_leverage, y_leverage, z_leverage = x_leverage[idx_leverage], y_leverage[idx_leverage], z_leverage[idx_leverage]
##############
fig1, ax1 = plt.subplots()
cax1 = ax1.scatter(x_spread, y_spread, c=z_spread, s=30, edgecolor='none')
ax1.set_title('Price Spread')
fig1.colorbar(cax1)
fig2, ax2 = plt.subplots()
cax2 = ax2.scatter(x_leverage, y_leverage, c=z_leverage, s=30, edgecolor='none')
ax2.set_title('Leverage')
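# The recurring idiom above -- z = gaussian_kde(xy)(xy) -- evaluates the
# fitted density at every data point so the scatter can be coloured by local
# density; sorting by z makes the densest points plot last. A minimal
# self-contained sketch:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

x = np.random.randn(1000)
y = x + 0.5 * np.random.randn(1000)
z = gaussian_kde(np.vstack([x, y]))(np.vstack([x, y]))
order = z.argsort()                     # densest points drawn on top
plt.scatter(x[order], y[order], c=z[order], s=10)
plt.colorbar(label='density')
plt.show()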
Beispiel #40
0
def summary_plot(
    shap_values,
    features=None,
    feature_names=None,
    max_display=None,
    plot_type=None,
    color=None,
    axis_color="#333333",
    title=None,
    alpha=1,
    show=True,
    get_png=False,
    sort=True,
    color_bar=True,
    plot_size="auto",
    layered_violin_max_num_bins=20,
    class_names=None,
    class_inds=None,
    color_bar_label=labels["FEATURE_VALUE"],
    # deprecated
    auto_size_plot=None,
):
    """Create a SHAP summary plot, colored by feature values when they are provided.

    Parameters
    ----------
    shap_values : numpy.array
        For single output explanations this is a matrix of SHAP values (# samples x # features).
        For multi-output explanations this is a list of such matrices of SHAP values.

    features : numpy.array or pandas.DataFrame or list
        Matrix of feature values (# samples x # features) or a feature_names list as shorthand

    feature_names : list
        Names of the features (length # features)

    max_display : int
        How many top features to include in the plot (default is 20, or 7 for interaction plots)

    plot_type : "dot" (default for single output), "bar" (default for multi-output), "violin",
        or "compact_dot".
        What type of summary plot to produce. Note that "compact_dot" is only used for
        SHAP interaction values.

    plot_size : "auto" (default), float, (float, float), or None
        What size to make the plot. By default the size is auto-scaled based on the number of
        features that are being displayed. Passing a single float will cause each row to be that 
        many inches high. Passing a pair of floats will scale the plot by that
        number of inches. If None is passed then the size of the current figure will be left
        unchanged.
    """

    # deprecation warnings
    if auto_size_plot is not None:
        warnings.warn(
            "auto_size_plot=False is deprecated and is now ignored! Use plot_size=None instead."
        )

    multi_class = False
    if isinstance(shap_values, list):
        multi_class = True
        if plot_type is None:
            plot_type = "bar"  # default for multi-output explanations
        assert plot_type == "bar", "Only plot_type = 'bar' is supported for multi-output explanations!"
    else:
        if plot_type is None:
            plot_type = "dot"  # default for single output explanations
        assert len(
            shap_values.shape
        ) != 1, "Summary plots need a matrix of shap_values, not a vector."

    # default color:
    if color is None:
        if plot_type == 'layered_violin':
            color = "coolwarm"
        elif multi_class:
            color = lambda i: colors.red_blue_circle(i / len(shap_values))
        else:
            color = colors.blue_rgb

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = features.columns
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif (features is not None) and len(
            features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    num_features = (shap_values[0].shape[1]
                    if multi_class else shap_values.shape[1])

    if features is not None:
        shape_msg = "The shape of the shap_values matrix does not match the shape of the " \
                    "provided data matrix."
        if num_features - 1 == features.shape[1]:
            assert False, shape_msg + " Perhaps the extra column in the shap_values matrix is the " \
                          "constant offset? If so, just pass shap_values[:,:-1]."
        else:
            assert num_features == features.shape[1], shape_msg

    if feature_names is None:
        feature_names = np.array(
            [labels['FEATURE'] % str(i) for i in range(num_features)])

    # plotting SHAP interaction values
    if not multi_class and len(shap_values.shape) == 3:

        if plot_type == "compact_dot":
            new_shap_values = shap_values.reshape(shap_values.shape[0], -1)
            new_features = np.tile(features,
                                   (1, 1, features.shape[1])).reshape(
                                       features.shape[0], -1)

            new_feature_names = []
            for c1 in feature_names:
                for c2 in feature_names:
                    if c1 == c2:
                        new_feature_names.append(c1)
                    else:
                        new_feature_names.append(c1 + "* - " + c2)

            return summary_plot(new_shap_values,
                                new_features,
                                new_feature_names,
                                max_display=max_display,
                                plot_type="dot",
                                color=color,
                                axis_color=axis_color,
                                title=title,
                                alpha=alpha,
                                show=show,
                                sort=sort,
                                color_bar=color_bar,
                                plot_size=plot_size,
                                class_names=class_names,
                                color_bar_label="*" + color_bar_label)

        if max_display is None:
            max_display = 7
        else:
            max_display = min(len(feature_names), max_display)

        sort_inds = np.argsort(-np.abs(shap_values.sum(1)).sum(0))

        # get plotting limits
        delta = 1.0 / (shap_values.shape[1]**2)
        slow = np.nanpercentile(shap_values, delta)
        shigh = np.nanpercentile(shap_values, 100 - delta)
        v = max(abs(slow), abs(shigh))
        slow = -v
        shigh = v

        pl.figure(figsize=(1.5 * max_display + 1, 0.8 * max_display + 1))
        pl.subplot(1, max_display, 1)
        proj_shap_values = shap_values[:, sort_inds[0], sort_inds]
        proj_shap_values[:, 1:] *= 2  # because off diag effects are split in half
        summary_plot(proj_shap_values,
                     features[:, sort_inds] if features is not None else None,
                     feature_names=feature_names[sort_inds],
                     sort=False,
                     show=False,
                     get_png=get_png,
                     color_bar=False,
                     plot_size=None,
                     max_display=max_display)
        pl.xlim((slow, shigh))
        pl.xlabel("")
        title_length_limit = 11
        pl.title(shorten_text(feature_names[sort_inds[0]], title_length_limit))
        for i in range(1, min(len(sort_inds), max_display)):
            ind = sort_inds[i]
            pl.subplot(1, max_display, i + 1)
            proj_shap_values = shap_values[:, ind, sort_inds]
            proj_shap_values *= 2
            proj_shap_values[:, i] /= 2  # because only off diag effects are split in half
            summary_plot(proj_shap_values,
                         features[:,
                                  sort_inds] if features is not None else None,
                         sort=False,
                         get_png=get_png,
                         feature_names=["" for i in range(len(feature_names))],
                         show=False,
                         color_bar=False,
                         plot_size=None,
                         max_display=max_display)
            pl.xlim((slow, shigh))
            pl.xlabel("")
            if i == min(len(sort_inds), max_display) // 2:
                pl.xlabel(labels['INTERACTION_VALUE'])
            pl.title(shorten_text(feature_names[ind], title_length_limit))
        pl.tight_layout(pad=0, w_pad=0, h_pad=0.0)
        pl.subplots_adjust(hspace=0, wspace=0.1)
        if show:
            pl.show()
        return

    if max_display is None:
        max_display = 20

    if sort:
        # order features by the sum of their effect magnitudes
        if multi_class:
            feature_order = np.argsort(
                np.sum(np.mean(np.abs(shap_values), axis=1), axis=0))
        else:
            feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0))
        feature_order = feature_order[-min(max_display, len(feature_order)):]
    else:
        feature_order = np.flip(np.arange(min(max_display, num_features)), 0)

    row_height = 0.4
    if plot_size == "auto":
        pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
    elif type(plot_size) in (list, tuple):
        pl.gcf().set_size_inches(plot_size[0], plot_size[1])
    elif plot_size is not None:
        pl.gcf().set_size_inches(8, len(feature_order) * plot_size + 1.5)
    pl.axvline(x=0, color="#999999", zorder=-1)

    if plot_type == "dot":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos,
                       color="#cccccc",
                       lw=0.5,
                       dashes=(1, 5),
                       zorder=-1)
            shaps = shap_values[:, i]
            values = None if features is None else features[:, i]
            inds = np.arange(len(shaps))
            np.random.shuffle(inds)
            if values is not None:
                values = values[inds]
            shaps = shaps[inds]
            colored_feature = True
            try:
                values = np.array(
                    values, dtype=np.float64)  # make sure this can be numeric
            except Exception:  # values could not be cast to numeric
                colored_feature = False
            N = len(shaps)
            # hspacing = (np.max(shaps) - np.min(shaps)) / 200
            # curr_bin = []
            nbins = 100
            quant = np.round(nbins * (shaps - np.min(shaps)) /
                             (np.max(shaps) - np.min(shaps) + 1e-8))
            inds = np.argsort(quant + np.random.randn(N) * 1e-6)
            layer = 0
            last_bin = -1
            ys = np.zeros(N)
            for ind in inds:
                if quant[ind] != last_bin:
                    layer = 0
                ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
                layer += 1
                last_bin = quant[ind]
            ys *= 0.9 * (row_height / np.max(ys + 1))

            if features is not None and colored_feature:
                # trim the color range, but prevent the color range from collapsing
                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)
                if vmin > vmax:  # fixes rare numerical precision issues
                    vmin = vmax

                assert features.shape[0] == len(
                    shaps
                ), "Feature and SHAP matrices must have the same number of rows!"

                # plot the nan values in the interaction feature as grey
                nan_mask = np.isnan(values)
                pl.scatter(shaps[nan_mask],
                           pos + ys[nan_mask],
                           color="#777777",
                           vmin=vmin,
                           vmax=vmax,
                           s=16,
                           alpha=alpha,
                           linewidth=0,
                           zorder=3,
                           rasterized=len(shaps) > 500)

                # plot the non-nan values colored by the trimmed feature value
                cvals = values[np.invert(nan_mask)].astype(np.float64)
                cvals_imp = cvals.copy()
                cvals_imp[np.isnan(cvals)] = (vmin + vmax) / 2.0
                cvals[cvals_imp > vmax] = vmax
                cvals[cvals_imp < vmin] = vmin
                pl.scatter(shaps[np.invert(nan_mask)],
                           pos + ys[np.invert(nan_mask)],
                           cmap=colors.red_blue,
                           vmin=vmin,
                           vmax=vmax,
                           s=16,
                           c=cvals,
                           alpha=alpha,
                           linewidth=0,
                           zorder=3,
                           rasterized=len(shaps) > 500)
            else:

                pl.scatter(shaps,
                           pos + ys,
                           s=16,
                           alpha=alpha,
                           linewidth=0,
                           zorder=3,
                           color=color if colored_feature else "#777777",
                           rasterized=len(shaps) > 500)

    elif plot_type == "violin":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos,
                       color="#cccccc",
                       lw=0.5,
                       dashes=(1, 5),
                       zorder=-1)

        if features is not None:
            global_low = np.nanpercentile(
                shap_values[:, :len(feature_names)].flatten(), 1)
            global_high = np.nanpercentile(
                shap_values[:, :len(feature_names)].flatten(), 99)
            for pos, i in enumerate(feature_order):
                shaps = shap_values[:, i]
                shap_min, shap_max = np.min(shaps), np.max(shaps)
                rng = shap_max - shap_min
                xs = np.linspace(
                    np.min(shaps) - rng * 0.2,
                    np.max(shaps) + rng * 0.2, 100)
                if np.std(shaps) < (global_high - global_low) / 100:
                    ds = gaussian_kde(shaps + np.random.randn(len(shaps)) *
                                      (global_high - global_low) / 100)(xs)
                else:
                    ds = gaussian_kde(shaps)(xs)
                ds /= np.max(ds) * 3

                values = features[:, i]
                window_size = max(10, len(values) // 20)
                smooth_values = np.zeros(len(xs) - 1)
                sort_inds = np.argsort(shaps)
                trailing_pos = 0
                leading_pos = 0
                running_sum = 0
                back_fill = 0
                for j in range(len(xs) - 1):

                    while leading_pos < len(shaps) and xs[j] >= shaps[
                            sort_inds[leading_pos]]:
                        running_sum += values[sort_inds[leading_pos]]
                        leading_pos += 1
                        if leading_pos - trailing_pos > 20:
                            running_sum -= values[sort_inds[trailing_pos]]
                            trailing_pos += 1
                    if leading_pos - trailing_pos > 0:
                        smooth_values[j] = running_sum / (leading_pos -
                                                          trailing_pos)
                        for k in range(back_fill):
                            smooth_values[j - k - 1] = smooth_values[j]
                    else:
                        back_fill += 1

                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)

                # plot the nan values in the interaction feature as grey
                nan_mask = np.isnan(values)
                pl.scatter(shaps[nan_mask],
                           np.ones(shap_values[nan_mask].shape[0]) * pos,
                           color="#777777",
                           vmin=vmin,
                           vmax=vmax,
                           s=9,
                           alpha=alpha,
                           linewidth=0,
                           zorder=1)
                # plot the non-nan values colored by the trimmed feature value
                cvals = values[np.invert(nan_mask)].astype(np.float64)
                cvals_imp = cvals.copy()
                cvals_imp[np.isnan(cvals)] = (vmin + vmax) / 2.0
                cvals[cvals_imp > vmax] = vmax
                cvals[cvals_imp < vmin] = vmin
                pl.scatter(shaps[np.invert(nan_mask)],
                           np.ones(shap_values[np.invert(nan_mask)].shape[0]) *
                           pos,
                           cmap=colors.red_blue,
                           vmin=vmin,
                           vmax=vmax,
                           s=9,
                           c=cvals,
                           alpha=alpha,
                           linewidth=0,
                           zorder=1)
                # smooth_values -= np.nanpercentile(smooth_values, 5)
                # smooth_values /= np.nanpercentile(smooth_values, 95)
                smooth_values -= vmin
                if vmax - vmin > 0:
                    smooth_values /= vmax - vmin
                for i in range(len(xs) - 1):
                    if ds[i] > 0.05 or ds[i + 1] > 0.05:
                        pl.fill_between(
                            [xs[i], xs[i + 1]], [pos + ds[i], pos + ds[i + 1]],
                            [pos - ds[i], pos - ds[i + 1]],
                            color=colors.red_blue_no_bounds(smooth_values[i]),
                            zorder=2)

        else:
            parts = pl.violinplot(shap_values[:, feature_order],
                                  range(len(feature_order)),
                                  points=200,
                                  vert=False,
                                  widths=0.7,
                                  showmeans=False,
                                  showextrema=False,
                                  showmedians=False)

            for pc in parts['bodies']:
                pc.set_facecolor(color)
                pc.set_edgecolor('none')
                pc.set_alpha(alpha)

    elif plot_type == "layered_violin":  # courtesy of @kodonnell
        num_x_points = 200
        bins = np.linspace(
            0, features.shape[0], layered_violin_max_num_bins + 1
        ).round(0).astype(
            'int')  # the indices of the feature data corresponding to each bin
        shap_min, shap_max = np.min(shap_values), np.max(shap_values)
        x_points = np.linspace(shap_min, shap_max, num_x_points)

        # loop through each feature and plot:
        for pos, ind in enumerate(feature_order):
            # decide how to handle: if #unique < layered_violin_max_num_bins then split by unique value, otherwise use bins/percentiles.
            # to keep simpler code, in the case of uniques, we just adjust the bins to align with the unique counts.
            feature = features[:, ind]
            unique, counts = np.unique(feature, return_counts=True)
            if unique.shape[0] <= layered_violin_max_num_bins:
                order = np.argsort(unique)
                thesebins = np.cumsum(counts[order])
                thesebins = np.insert(thesebins, 0, 0)
            else:
                thesebins = bins
            nbins = thesebins.shape[0] - 1
            # order the feature data so we can apply percentiling
            order = np.argsort(feature)
            # x axis is located at y0 = pos, with pos being there for offset
            y0 = np.ones(num_x_points) * pos
            # calculate kdes:
            ys = np.zeros((nbins, num_x_points))
            for i in range(nbins):
                # get shap values in this bin:
                shaps = shap_values[order[thesebins[i]:thesebins[i + 1]], ind]
                # if there's only one element, then we can't
                if shaps.shape[0] == 1:
                    warnings.warn(
                        "not enough data in bin #%d for feature %s, so it'll be ignored. Try increasing the number of records to plot."
                        % (i, feature_names[ind]))
                    # to ignore it, just set it to the previous y-values (so the area between them will be zero). Note ys is already 0, so there's
                    # nothing to do if i == 0
                    if i > 0:
                        ys[i, :] = ys[i - 1, :]
                    continue
                # save kde of them: note that we add a tiny bit of gaussian noise to avoid singular matrix errors
                ys[i, :] = gaussian_kde(shaps + np.random.normal(
                    loc=0, scale=0.001, size=shaps.shape[0]))(x_points)
                # scale it up so that the 'size' of each y represents the size of the bin. For continuous data this will
                # do nothing, but when we've gone with the unique option, this will matter - e.g. if 99% are male and 1%
                # female, we want the 1% to appear a lot smaller.
                size = thesebins[i + 1] - thesebins[i]
                bin_size_if_even = features.shape[0] / nbins
                relative_bin_size = size / bin_size_if_even
                ys[i, :] *= relative_bin_size
            # now plot 'em. We don't plot the individual strips, as this can leave whitespace between them.
            # instead, we plot the full kde, then remove outer strip and plot over it, etc., to ensure no
            # whitespace
            ys = np.cumsum(ys, axis=0)
            width = 0.8
            scale = ys.max() * 2 / width  # 2 is here as we plot both sides of x axis
            for i in range(nbins - 1, -1, -1):
                y = ys[i, :] / scale
                c = pl.get_cmap(color)(
                    i / (nbins - 1)
                ) if color in pl.cm.datad else color  # if color is a cmap, use it, otherwise use a color
                pl.fill_between(x_points, pos - y, pos + y, facecolor=c)
        pl.xlim(shap_min, shap_max)

    elif not multi_class and plot_type == "bar":
        feature_inds = feature_order[:max_display]
        y_pos = np.arange(len(feature_inds))
        global_shap_values = np.abs(shap_values).mean(0)
        pl.barh(y_pos,
                global_shap_values[feature_inds],
                0.7,
                align='center',
                color=color)
        pl.yticks(y_pos, fontsize=13)
        pl.gca().set_yticklabels([feature_names[i] for i in feature_inds])

    elif multi_class and plot_type == "bar":
        if class_names is None:
            class_names = ["Class " + str(i) for i in range(len(shap_values))]
        feature_inds = feature_order[:max_display]
        y_pos = np.arange(len(feature_inds))
        left_pos = np.zeros(len(feature_inds))

        if class_inds is None:
            class_inds = np.argsort([
                -np.abs(shap_values[i]).mean() for i in range(len(shap_values))
            ])
        elif class_inds == "original":
            class_inds = range(len(shap_values))
        for i, ind in enumerate(class_inds):
            global_shap_values = np.abs(shap_values[ind]).mean(0)
            pl.barh(y_pos,
                    global_shap_values[feature_inds],
                    0.7,
                    left=left_pos,
                    align='center',
                    color=color(i),
                    label=class_names[ind])
            left_pos += global_shap_values[feature_inds]
        pl.yticks(y_pos, fontsize=13)
        pl.gca().set_yticklabels([feature_names[i] for i in feature_inds])
        pl.legend(frameon=False, fontsize=12)

    # draw the color bar
    if color_bar and features is not None and plot_type != "bar" and \
            (plot_type != "layered_violin" or color in pl.cm.datad):
        import matplotlib.cm as cm
        m = cm.ScalarMappable(
            cmap=colors.red_blue if plot_type != "layered_violin" else pl.
            get_cmap(color))
        m.set_array([0, 1])
        cb = pl.colorbar(m, ticks=[0, 1], aspect=1000)
        cb.set_ticklabels(
            [labels['FEATURE_VALUE_LOW'], labels['FEATURE_VALUE_HIGH']])
        cb.set_label(color_bar_label, size=12, labelpad=0)
        cb.ax.tick_params(labelsize=11, length=0)
        cb.set_alpha(1)
        cb.outline.set_visible(False)
        bbox = cb.ax.get_window_extent().transformed(
            pl.gcf().dpi_scale_trans.inverted())
        cb.ax.set_aspect((bbox.height - 0.9) * 20)
        # cb.draw_all()

    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('none')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().spines['left'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color)
    pl.yticks(range(len(feature_order)),
              [feature_names[i] for i in feature_order],
              fontsize=13)
    if plot_type != "bar":
        pl.gca().tick_params('y', length=20, width=0.5, which='major')
    pl.gca().tick_params('x', labelsize=11)
    pl.ylim(-1, len(feature_order))
    if plot_type == "bar":
        pl.xlabel(labels['GLOBAL_VALUE'], fontsize=13)
    else:
        pl.xlabel(labels['VALUE'], fontsize=13)
    if show:
        pl.show()
    if get_png:
        file = BytesIO()
        pl.savefig(file, format='png', bbox_inches="tight")
        return file
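# gaussian_kde raises a LinAlgError ("singular matrix") when the input has
# (near-)zero variance; summary_plot above guards against this twice by
# jittering the data with small Gaussian noise before fitting. A minimal
# sketch of that workaround in isolation:
import numpy as np
from scipy.stats import gaussian_kde

shaps = np.zeros(50)                        # degenerate input: zero variance
try:
    kde = gaussian_kde(shaps)
except np.linalg.LinAlgError:
    noise = np.random.normal(0, 1e-3, size=shaps.shape[0])
    kde = gaussian_kde(shaps + noise)       # jitter makes the fit well-posed
print(kde(np.linspace(-0.01, 0.01, 5)))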
Beispiel #41
0
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
x_eval = np.linspace(-10, 10, num=200)
kde1 = stats.gaussian_kde(x1)
kde2 = stats.gaussian_kde(x1, bw_method='silverman')


def my_kde_bandwidth(obj, fac=1. / 5):
    """We use Scott's Rule, multiplied by a constant factor."""
    return np.power(obj.n, -1. / (obj.d + 4)) * fac


fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(x1, np.zeros(x1.shape), 'b+', ms=20)  # rug plot
ax.plot(x_eval, kde1(x_eval), 'k-', label="Scott's Rule")
ax.plot(x_eval, kde2(x_eval), 'r-', label="Silverman's Rule")
kde3 = stats.gaussian_kde(x1, bw_method=my_kde_bandwidth)
ax.plot(x_eval, kde3(x_eval), 'g-', label="With smaller BW")
ax.legend()

plt.show()
Beispiel #42
0
def twod_kde(x, y):
    X, Y = np.mgrid[x.min()*0.9:x.max()*1.1:100j, y.min()*0.9:y.max()*1.1:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([x, y])
    kernel = gaussian_kde(values)
    return X, Y, np.reshape(kernel(positions).T, X.shape)
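# A usage sketch for twod_kde: the returned grid drops straight into a
# filled contour plot. The data here are illustrative; the imports below
# make the sketch self-contained alongside the definition above.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

x = np.random.randn(500) + 3
y = 0.5 * x + np.random.randn(500) + 3
X, Y, Z = twod_kde(x, y)
plt.contourf(X, Y, Z, levels=20)
plt.colorbar(label='density')
plt.show()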
Beispiel #43
0
def plot_errors(model, X_test, y_test, scaler_l):
    # plot MAPEs
    predicted_labels = _back_scaling(model.predict(X_test), scaler_l.data_min_,
                                     scaler_l.data_max_)
    true_labels = _back_scaling(y_test, scaler_l.data_min_, scaler_l.data_max_)

    # compute mape for each value
    res = {}
    means = {}
    stdvs = {}

    for index, value in enumerate(predicted_labels[:, opt.window - 1, :]):
        if int(true_labels[index, opt.window - 1, 0]) in res:
            res[int(true_labels[index, opt.window - 1, 0])].append(
                np.abs(predicted_labels[index, opt.window - 1, 0] -
                       true_labels[index, opt.window - 1, 0]) /
                true_labels[index, opt.window - 1, 0])
        else:
            res[int(true_labels[index, opt.window - 1, 0])] = [
                np.abs(predicted_labels[index, opt.window - 1, 0] -
                       true_labels[index, opt.window - 1, 0]) /
                true_labels[index, opt.window - 1, 0]
            ]
    for i, j in res.items():
        means[i] = np.sum(j) / len(j) * 100
        stdvs[i] = np.std(j) * 100

    lists_means = sorted(means.items())
    lists_stdvs = sorted(stdvs.items())
    x1, y1 = zip(*lists_means)
    x2, y2 = zip(*lists_stdvs)

    density = stats.gaussian_kde(true_labels[:, opt.window - 1, :].flatten())
    s = np.sum(true_labels[:, opt.window - 1, :].flatten())

    fig = plt.figure(figsize=(3, 2))
    ax = fig.add_subplot(111)

    ax.set_xlabel('# Flows / Sequence')

    lns1 = ax.plot(x1, density(x1) * s, label='Counts')
    ax2 = ax.twinx()
    lns2 = ax2.plot(x1,
                    y1,
                    color='tab:red',
                    label='MAPE (%)',
                    linestyle='--',
                    linewidth=2,
                    alpha=0.7)
    lns2b = ax2.fill_between(x1,
                             np.array(y1) - np.array(y2),
                             np.array(y1) + np.array(y2),
                             color='r',
                             alpha=0.2)

    lns = lns1 + lns2
    labs = [l.get_label() for l in lns]
    ax.legend(lns, labs, loc=0)

    ax.grid(alpha=0.4)
    ax.set_xlabel('# Flows / Sequence')
    ax.set_ylabel('Counts')
    ax2.set_ylabel('MAPE (%)')
    ax2.set_ylim(0, 100)
    plt.savefig("errors_{}_{}.pdf".format(
        opt.window,
        datetime.now().strftime("%Y%m%d-%H%M%S")),
                bbox_inches='tight')
Beispiel #44
0
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
t1 = time.time()
print "Grid mesh: ", t1-t0

t2 = time.time()
positions = np.vstack([xx.ravel(), yy.ravel()])
t3 = time.time()
print "Positions: ", t3-t2

t4 = time.time()
values = np.vstack([x, y])
t5 = time.time()
print "Values:    ", t5-t4

t6 = time.time()
kernel = st.gaussian_kde(values)
t7 = time.time()
print "Kernel:    ", t7-t6

t8 = time.time()
f = np.reshape(kernel(positions).T, xx.shape)
t9 = time.time()
print "Reshape:   ", t9-t8
# print f.shape
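# In the timing breakdown above, the kernel(positions) evaluation dominates:
# gaussian_kde evaluation costs O(n_data * n_grid). A sketch of the obvious
# speed-up, evaluating on a coarser grid (illustrative sizes only):
import numpy as np
import scipy.stats as st

x, y = np.random.randn(2, 5000)
kernel = st.gaussian_kde(np.vstack([x, y]))
xx, yy = np.mgrid[x.min():x.max():50j, y.min():y.max():50j]  # 50x50 instead of 100x100
f = np.reshape(kernel(np.vstack([xx.ravel(), yy.ravel()])), xx.shape)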


cdict1 = {'blue':   ((0.00, 1.0, 1.0),
                    (0.10, 1.0, 1.0),
                    (0.20, 1.0, 1.0),
                    (0.40, 1.0, 1.0),
                    (0.60, 1.0, 1.0),
Beispiel #45
0
        #True Peak location
        pt = np.array([np.round(rib / 2), np.round(rib / 2)])

        #Setting the background and the consequent point source flux
        bkg = 100
        bkg_arr = np.random.poisson(np.ones_like(X) * bkg, X.shape)
        flux = 100 * (1 / 0.001)
        sig = 1.5

        #Draw PSF from peaks with 1.5 pixel falloff, and add background
        d = flux * np.exp((-(pt[0] - Y)**2 - (pt[1] - X)**2) / sig**2)
        d += bkg_arr  #Adding poissoninan noise to bkg
        '''Estimate background instead using the KDE method'''
        dd = d[d < np.nanpercentile(d, [75])]
        kernel = stats.gaussian_kde(dd.flatten(), bw_method='scott')
        alpha = np.linspace(dd.min(), dd.max(), 10000)
        bkg_est[idx] = alpha[np.argmax(kernel(alpha))]

        # bkg_est[idx] = np.median(d[d < np.nanpercentile(d,[20])])

        if plots_on:
            if idx == 0 or idx == 3:
                fig, ax = plt.subplots()
                c = ax.imshow(d)
                fig.colorbar(c, label='Flux (arbitrary units)')
                ax.set_xlabel('Pixel #')
                ax.set_ylabel('Pixel #')
                ax.set_title('Total pixels: ' + str(numpix[idx]))
                plt.show()
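# The background estimate above is the mode of the pixel distribution:
# evaluate the KDE on a fine grid and take the argmax. A minimal sketch of
# the same mode estimator on synthetic, source-free data:
import numpy as np
from scipy import stats

pixels = np.random.poisson(100, size=10000).astype(float)
kernel = stats.gaussian_kde(pixels, bw_method='scott')
alpha = np.linspace(pixels.min(), pixels.max(), 1000)
print(alpha[np.argmax(kernel(alpha))])   # close to the true background of 100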
Beispiel #46
0
def ALvsNLcomp19():
    fig = plt.figure()
    ax1 = fig.add_axes([0, 0, 0.5, 0.9], xlim=(0.35, 0.51))
    ax2 = fig.add_axes([0.6, 0, 0.5, 0.9], xlim=(3, 6))

    fig.suptitle('American vs National League Comparisons, 2019 Season')
    ax1.set_xlabel('AL vs NL Slugging Percentage (%)')
    ax2.set_xlabel('AL vs NL Earned Run Average')
    ax1.hist([AL19['SLG'], NL19['SLG']],
             bins=8,
             label=['AL', 'NL'],
             linewidth=1,
             density=True,
             alpha=0.4,
             edgecolor='black',
             align='right')
    ax2.hist([AL19['ERA'], NL19['ERA']],
             bins=8,
             label=['AL', 'NL'],
             linewidth=1,
             density=True,
             alpha=0.4,
             edgecolor='black',
             align='right')

    En, Ex = min(AL19['ERA']) - 0.2 * (np.mean(AL19['ERA'])), max(
        AL19['ERA']) + 0.1 * (np.mean(AL19['ERA']))
    Bn, Bx = min(AL19['SLG']) - 0.2 * (np.mean(AL19['SLG'])), max(
        AL19['SLG']) + 0.2 * (np.mean(AL19['SLG']))
    kde_BA = np.linspace(Bn, Bx, 301)
    kde_ERA = np.linspace(En, Ex, 301)

    AL_BA_kde19 = st.gaussian_kde(AL19['SLG'].dropna())
    NL_BA_kde19 = st.gaussian_kde(NL19['SLG'].dropna())
    AL_ERA_kde19 = st.gaussian_kde(AL19['ERA'].dropna())
    NL_ERA_kde19 = st.gaussian_kde(NL19['ERA'].dropna())

    ax1.plot(kde_BA,
             AL_BA_kde19.pdf(kde_BA),
             color='blue',
             linewidth=2,
             alpha=0.8)
    ax1.plot(kde_BA,
             NL_BA_kde19.pdf(kde_BA),
             color='orange',
             linewidth=2.5,
             alpha=0.95)
    ax2.plot(kde_ERA,
             AL_ERA_kde19.pdf(kde_ERA),
             color='blue',
             linewidth=2,
             alpha=0.8)
    ax2.plot(kde_ERA,
             NL_ERA_kde19.pdf(kde_ERA),
             color='orange',
             linewidth=2,
             alpha=0.8)
    ax1.xaxis.set_ticks(np.arange(0.36, 0.50, 0.02))
    ax2.xaxis.set_ticks(np.arange(3.0, 6.0, 0.5))
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper left')
    plt.show()

    plt.close()
Beispiel #47
0
def main(args):

    # matplotlib settings
    plt.rc('font', family='serif')

    if args.two_col:
        plt.rc('xtick', labelsize=11)
        plt.rc('ytick', labelsize=11)
        plt.rc('axes', labelsize=11)
        plt.rc('axes', titlesize=11)
        plt.rc('legend', fontsize=11)
        plt.rc('legend', title_fontsize=11)
        plt.rc('lines', linewidth=1)
        plt.rc('lines', markersize=3)

        width = 3.25  # Two column style
        width, height = set_size(width=width * 2, fraction=1, subplots=(2, 2))
        fig, axs = plt.subplots(2, 2, figsize=(width, height * 1.25))

    else:
        # matplotlib settings
        plt.rc('font', family='serif')
        plt.rc('xtick', labelsize=18)
        plt.rc('ytick', labelsize=18)
        plt.rc('axes', labelsize=21)
        plt.rc('axes', titlesize=21)
        plt.rc('legend', fontsize=19)
        plt.rc('legend', title_fontsize=11)
        plt.rc('lines', linewidth=1)
        plt.rc('lines', markersize=6)

        width = 5.5  # Neurips 2020
        width, height = set_size(width=width * 3, fraction=1, subplots=(1, 4))
        fig, axs = plt.subplots(1, 4, figsize=(width, height * 1.25))

    axs = axs.flatten()

    results = get_results(args)

    train_feature_vals = results['train_feature_vals']
    train_feature_bins = results['train_feature_bins']
    train_pos_ndx = results['train_pos_ndx']
    train_neg_ndx = results['train_neg_ndx']
    train_weight = results['train_weight']
    train_sim = results['train_sim']
    feature_name = results['target_feature']
    test_val = results['test_val']

    train_sim_weight = train_weight * train_sim

    # gamma vs alpha
    print('plotting gamma vs alpha...')
    ax = axs[0]
    xy = np.vstack([train_weight, train_sim])
    z = gaussian_kde(xy)(xy)
    ax.scatter(train_weight,
               train_sim,
               c=z,
               s=20,
               edgecolor='none',
               rasterized=args.rasterize)
    ax.axhline(0, color='k')
    ax.axvline(0, color='k')
    ax.set_ylabel(r'$\gamma$')
    ax.set_xlabel(r'$\alpha \hat{y}$')

    # unweighted
    print('plotting unweighted...')
    ax = axs[1]
    ax.hist(train_feature_vals[train_pos_ndx],
            bins=train_feature_bins,
            color='g',
            hatch='.',
            alpha=args.alpha,
            label='positive instances')
    ax.hist(train_feature_vals[train_neg_ndx],
            bins=train_feature_bins,
            color='r',
            hatch='\\',
            alpha=args.alpha,
            label='negative instances')
    ax.axvline(test_val, color='k', linestyle='--')
    ax.set_xlabel(feature_name.capitalize())
    ax.set_ylabel('Density')
    ax.set_title('Unweighted')
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    ax.tick_params(axis='both', which='major')

    # weighted by TREX's global weights
    print('plotting weighted by global weights...')
    ax = axs[2]
    ax.hist(train_feature_vals[train_pos_ndx],
            bins=train_feature_bins,
            color='g',
            hatch='.',
            alpha=args.alpha,
            weights=train_weight[train_pos_ndx])

    ax.hist(train_feature_vals[train_neg_ndx],
            bins=train_feature_bins,
            color='r',
            hatch='\\',
            alpha=args.alpha,
            weights=train_weight[train_neg_ndx])
    ax.axvline(test_val, color='k', linestyle='--')
    ax.set_ylabel('Density')
    ax.set_xlabel(feature_name.capitalize())
    ax.set_title(r'Weighted by $\alpha \hat{y}$')
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    ax.tick_params(axis='both', which='major')

    # weighted by TREX's global weights * similarity to the test instance
    print('plotting weighted by weight * similarity...')
    train_sim_weight = train_weight * train_sim
    ax = axs[3]
    ax.hist(train_feature_vals[train_pos_ndx],
            bins=train_feature_bins,
            color='g',
            hatch='.',
            alpha=args.alpha,
            weights=train_sim_weight[train_pos_ndx],
            label='pos samples')

    ax.hist(train_feature_vals[train_neg_ndx],
            bins=train_feature_bins,
            color='r',
            hatch='\\',
            alpha=args.alpha,
            weights=train_sim_weight[train_neg_ndx],
            label='neg samples')
    ax.axvline(test_val, color='k', linestyle='--')
    ax.legend(frameon=False)
    ax.set_ylabel('Density')
    ax.set_xlabel(feature_name.capitalize())
    ax.set_title(r'Weighted by $\alpha \hat{y} \gamma$')
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    ax.tick_params(axis='both', which='major')

    # save plot
    out_dir = os.path.join(args.out_dir, args.tree_kernel)
    os.makedirs(out_dir, exist_ok=True)

    plt.tight_layout()

    if not args.two_col:
        fig.subplots_adjust(wspace=0.25, hspace=0.05)

    plt.savefig(os.path.join(out_dir, 'misclassification.{}'.format(args.ext)))
Beispiel #48
0
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stat

df = pd.DataFrame({
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000]
})

salary = df['Salary']
salary.plot.hist(title='Salary Distribution', color='lightblue', bins=25)
plt.axvline(salary.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(salary.median(), color='green', linestyle='dashed', linewidth=2)
#plt.show()

df = pd.DataFrame({
    'Test': [
        172, 174, 176, 172, 172, 173, 176, 172, 177, 174, 176, 175, 176, 169,
        175, 174, 174, 174, 175, 173, 171, 171, 175, 175, 173, 175, 175
    ]
})
test = df["Test"]
test.plot.hist(title='Test')
density = stat.gaussian_kde(test)
n, x, _ = plt.hist(test, histtype='step', density=True, bins=10)
plt.plot(x, density(x))  # the histogram uses density=True, so the KDE needs no rescaling
plt.show()
print(df["Test"].std())
print(df['Test'].median())
print(df.describe())
Beispiel #49
0
def kde_estimation(data, est_x, n_data, bin_interval):
    estimator = stats.gaussian_kde(data)
    kde = estimator(est_x)
    kde = kde * n_data * bin_interval
    return kde
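# A usage sketch for kde_estimation: scaling the density by
# n_data * bin_interval puts the KDE curve on the same scale as raw
# histogram counts. Data and bin settings here are illustrative.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data = np.random.randn(300)
bin_interval = 0.25
bins = np.arange(-4.0, 4.0 + bin_interval, bin_interval)
est_x = np.linspace(-4, 4, 200)
plt.hist(data, bins=bins)
plt.plot(est_x, kde_estimation(data, est_x, len(data), bin_interval))
plt.show()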
Beispiel #50
0
###############
#
# Translated from R to Python. Copyright (c) 2016 Masahiro Imai. Released under the MIT license.
#
###############
import pandas
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import numpy as np
import seaborn

fish = pandas.read_csv('2-2-1-fish.csv')
print(fish.head())

kde = gaussian_kde(fish['length'])

fig = plt.figure(figsize=(10,5))

x_grid = np.linspace(0, max(fish['length']), num=100)
weights = np.ones_like(fish['length'])/float(len(fish['length']))

ax1 = fig.add_subplot(1, 2, 1)
ax1.hist(fish['length'], weights=weights)
ax1.set_xlabel('length')
ax1.set_ylabel('count')

ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(x_grid, kde(x_grid))  # reuse the x_grid defined above
ax2.set_xlabel('length')
ax2.set_ylabel('density')
Beispiel #51
0
import numpy as np
from numpy.testing import assert_array_almost_equal
from scipy import stats


def test_kde_integer_input():
    """Regression test for #1181."""
    x1 = np.arange(5)
    kde = stats.gaussian_kde(x1)
    y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
    assert_array_almost_equal(kde(x1), y_expected, decimal=6)
Beispiel #52
0
def getPz(args):
    '''
    Compute the P(z) from nearest neighbors P(z)'s
    given a reference (training) file.

    INPUT
    - fileInNames: fits file names: inputFile^trainingFile
    - keys: column names describing dimensions, example; "MAG_I,MAG_G-MAG_R,MAG_R-MAG_I"
    - selection: selection strings inputSelect^trainingSelect

    NOTES:
    - both the photoz_file and reference_file must contain
    the same column names as defined by "keys"

    OUTPUT
    - P(z)'s of input file objects + input columns if merge_with_input is set
    '''
    """
    options
    """

    # for large dataset
    sys.setrecursionlimit(100000)

    from sklearn.neighbors import KernelDensity

    verbose = True

    fileInName = args.input.split("^")

    if args.select is not None:
        fileInSelect = args.select.split("^")
    else:
        fileInSelect = [None for f in fileInName]

    if verbose: sys.stderr.write("Reading input files...")
    """
    input file
    """
    sample, sampleSelect = getCols(fileInName[0],
                                   args.keys.split(","),
                                   selection=fileInSelect[0],
                                   array=True)

    if args.keys_err is not None:
        sample_err, _ = getCols(fileInName[0],
                                args.keys_err.split(","),
                                selection=fileInSelect[0],
                                array=True)
    (Nsample, Ndim) = sample.shape
    sample[np.logical_not(np.isfinite(sample))] = 0.0  # set NaNs and inf to 0

    if Nsample == 0:
        raise ValueError("The input file is empty, exiting...")
    """
    training file
    """
    ref, select = getCols(fileInName[1],
                          args.keys_ref.split(","),
                          selection=fileInSelect[1],
                          array=True)
    if args.keys_err is not None:
        ref_err, select = getCols(fileInName[1],
                                  args.keys_err.split(","),
                                  selection=fileInSelect[1],
                                  array=True)
    (Nref, Ndim) = ref.shape
    ref[np.logical_not(np.isfinite(ref))] = 0.0  # set NaNs and inf to 0

    # for reference: histo or sum of PDFs
    if args.PDF_histo:
        # if histogram, set bins
        bins = np.linspace(0.0, 6.0, num=601, endpoint=True)
        PDF_ref_bins = 0.5 * (bins[1:] + bins[:-1])
        # get reference redshifts and weights
        # set with -keys_histo redshift,weight
        keys_histo = args.keys_histo.split(",")
        if len(keys_histo) > 1:
            #[z_ref, weight, source], select = getCols(fileInName[1], keys_histo, selection=fileInSelect[1])
            [z_ref, weight], select = getCols(fileInName[1],
                                              keys_histo,
                                              selection=fileInSelect[1])
        else:
            [z_ref], select = getCols(fileInName[1],
                                      keys_histo,
                                      selection=fileInSelect[1])
            weight = np.ones(len(z_ref))
    else:
        # if PDF, recover PDF from reference
        PDF_ref, PDF_ref_bins = getPDF(fileInName[1],
                                       normalise=args.no_norm,
                                       select=select,
                                       PDF_key="PDF_L15")
        weight = np.ones(len(PDF_ref))

    if verbose: sys.stderr.write("done\n")
    """
    Build tree of colors for reference
    """
    if verbose: sys.stderr.write("Building reference tree...")
    tree = spatial.KDTree(ref)
    if verbose: sys.stderr.write("done\n")

    # does not work:
    # import pickle
    # pickle.dump(tree, open("ref.pickle", 'w+'))
    # tree = pickle.load(open("ref.pickle", 'rb'))

    # return
    """
    main loop
    """
    # tests    Nsample = 1000

    Nnei = 50
    kde = False  # see https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

    # output arrays
    pofz = np.zeros((Nsample, len(PDF_ref_bins)), dtype=np.float32)
    if args.PDF_histo:

        zTrain = np.zeros((Nsample, Nnei), dtype=np.float32)
        if len(keys_histo) > 1:
            wTrain = np.zeros((Nsample, Nnei), dtype=np.float32)
            # sTrain = np.zeros((Nsample, Nnei), dtype=np.float32)

    est = collections.OrderedDict()
    for name in [
            'zmean', 'zmode', 'zmedian', 'z_std', 'zl95', 'zl68', 'zh68',
            'zh95', 'z_mc', 'zconf'
    ]:
        est[name] = np.zeros(Nsample) - 99.0

    zmin = PDF_ref_bins[0]
    zmax = PDF_ref_bins[-1]

    for i in range(Nsample):
        # for i in range(1):

        # photometry failure
        if abs(np.sum(sample[i, :])) < EPS:
            continue

        # find nearest neighbors
        (d, indices) = tree.query(sample[i, :], Nnei)

        # associated errors
        derr = np.ones(Nnei)
        if args.keys_err is not None:

            for n, j in enumerate(indices):
                err = np.sum(sample_err[i, :] + ref_err[j, :])
                if err > EPS:
                    derr[n] = err

        # weight: 1/distance * 1/err * input_weight
        w = np.ones(len(indices))
        for n, j in enumerate(indices):
            if d[n] > EPS:
                w[n] = 1.0 / d[n] * 1.0 / derr[n] * weight[j]

        if args.PDF_histo:

            zTrain[i, :] = z_ref[indices]
            if len(keys_histo) > 1:
                wTrain[i, :] = w
                # sTrain[i, :] = source[indices]

            if kde:

                z_weighted = weightedSample(z_ref[indices], w)

                std = np.std(z_weighted)

                if (std < 1.e-4) | (len(z_weighted) < 2):
                    pofz[i, :], _ = np.histogram(z_ref[indices],
                                                 bins=bins,
                                                 density=True,
                                                 weights=w)
                else:
                    density = gaussian_kde(z_weighted, bw_method=0.03 / std)
                    pofz[i, :] = density.pdf(PDF_ref_bins)

                    # density = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(z_weighted[:, np.newaxis])
                    # pofz[i, :] = density.score_samples(PDF_ref_bins[:, np.newaxis])

            else:
                pofz[i, :], _ = np.histogram(z_ref[indices],
                                             bins=bins,
                                             density=True,
                                             weights=w)
        else:
            for n, j in enumerate(indices):
                if (d[n] > EPS) & (np.sum(PDF_ref[j, :]) > EPS):
                    pofz[i, :] += PDF_ref[j, :] * w[n]

        # PDF_sample_inter = np.interp(PDF_ref_bins, PDF_sample_bins, PDF_sample[i,:])
        # pofz[i,:] *= PDF_sample_inter

        # normalise PDF
        norm = int_trapz(PDF_ref_bins, pofz[i, :], PDF_ref_bins[0],
                         PDF_ref_bins[-1])
        if norm > EPS:
            pofz[i, :] /= norm

            est['zmean'][i] = int_trapz(PDF_ref_bins,
                                        pofz[i, :] * PDF_ref_bins, zmin, zmax)
            est['zmode'][i] = max_pos_PDF(PDF_ref_bins, pofz[i, :])
            est['zmedian'][i] = medianfromDist(PDF_ref_bins, pofz[i, :])
            est['z_std'][i] = np.sqrt(
                int_trapz(
                    PDF_ref_bins,
                    pofz[i, :] * pow(PDF_ref_bins - est['zmean'][i], 2.0),
                    zmin, zmax))
            est['zl95'][i] = sampleFromDist(PDF_ref_bins,
                                            pofz[i, :],
                                            q=0.05 / 2.0)
            est['zl68'][i] = sampleFromDist(PDF_ref_bins,
                                            pofz[i, :],
                                            q=0.32 / 2.0)
            est['zh68'][i] = sampleFromDist(PDF_ref_bins,
                                            pofz[i, :],
                                            q=1.0 - 0.32 / 2.0)
            est['zh95'][i] = sampleFromDist(PDF_ref_bins,
                                            pofz[i, :],
                                            q=1.0 - 0.05 / 2.0)
            est['z_mc'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :])
            est['zconf'][i] = int_trapz(
                PDF_ref_bins, pofz[i, :],
                est['zmedian'][i] - 0.03 * (1.0 + est['zmedian'][i]),
                est['zmedian'][i] + 0.03 * (1.0 + est['zmedian'][i]))

        # for name in est.keys():
        # print "{0:s}:{1}".format(name,est[name][i])

        # test
        #z_weighted = weightedSample(PDF_ref_bins[pofz[i,:]>EPS], pofz[i,:][pofz[i,:]>EPS])
        #density = gaussian_kde(z_weighted)
        #pofz_KDE = density.pdf(PDF_ref_bins)
        #z_median[i] = medianfromDist(PDF_ref_bins, pofz_KDE)

        if verbose:
            if (i + 1) % 1000 == 0:
                sys.stderr.write("\r" +
                                 "P(z): computed {0:d} objects".format(i + 1))
                sys.stderr.flush()

    if verbose:
        sys.stderr.write("\r" + "P(z): computed {0:d} objects\n".format(i + 1))
    """
    write output file
    """
    if verbose: sys.stderr.write("Writing output file...")
    cols = []

    if args.key_id is not None:
        [ID], _ = getCols(fileInName[0], [args.key_id], select=fileInSelect[0])
        cols.append(fits.Column(name="ID", format='K', array=ID))

    for name in est.keys():
        cols.append(fits.Column(name=name, format='E', array=est[name]))

    #cols.append(fits.Column(name='zmedian', format='E', array=zmedian))
    cols.append(
        fits.Column(name='PDF',
                    format=str(len(PDF_ref_bins)) + 'E',
                    array=pofz))

    if args.PDF_histo:
        cols.append(
            fits.Column(name='zTrain', format=str(Nnei) + 'E', array=zTrain))
        if len(keys_histo) > 1:
            cols.append(
                fits.Column(name='wTrain',
                            format=str(Nnei) + 'E',
                            array=wTrain))
            # cols.append(fits.Column(name='sTrain',      format=str(Nnei)+'I', array=sTrain))

    cols_bins = []
    # cols_bins.append(fits.Column(name='PDF', format=str(len(PDF_ref_bins))+'E', array=[PDF_ref_bins]))
    cols_bins.append(fits.Column(name='BINS', format='E', array=PDF_ref_bins))
    # cols_bins.append(fits.Column(name='Z_MIN', format='E', array=[PDF_ref_bins[0]]))
    # cols_bins.append(fits.Column(name='Z_MAX', format='E', array=[PDF_ref_bins[-1]]))
    # cols_bins.append(fits.Column(name='DELTA_Z', format='E', array=[PDF_ref_bins[1]-PDF_ref_bins[0]]))

    if args.merge_with_input:
        fileIn = fits.open(fileInName[0])
        hdu_0 = fileIn[0]
        if sampleSelect is not None:
            for c in fileIn[1].columns:
                cols.append(
                    fits.Column(name=c.name,
                                format=c.format,
                                array=fileIn[1].data[c.name][sampleSelect]))
            hdu_1 = fits.BinTableHDU.from_columns(fits.ColDefs(cols))
        else:
            hdu_1 = fits.BinTableHDU.from_columns(fileIn[1].columns +
                                                  fits.ColDefs(cols))
        if len(fileIn) > 2:
            hdu_2 = fits.BinTableHDU.from_columns(fileIn[2].columns +
                                                  fits.ColDefs(cols_bins))
        else:
            hdu_2 = fits.BinTableHDU.from_columns(fits.ColDefs(cols_bins))
    else:
        hdu_0 = fits.PrimaryHDU()
        hdu_1 = fits.BinTableHDU.from_columns(fits.ColDefs(cols))
        hdu_2 = fits.BinTableHDU.from_columns(fits.ColDefs(cols_bins))

    hdu_1.header["EXTNAME"] = "DATA"
    hdu_2.header["EXTNAME"] = "BINS"

    if args.PDF_histo:
        hdu_1.header["z_min"] = bins[0]
        hdu_1.header["z_max"] = bins[-1]
        hdu_1.header["delta_z"] = bins[1] - bins[0]
    else:
        hdu_1.header["z_min"] = PDF_ref_bins[0]
        hdu_1.header["z_max"] = PDF_ref_bins[-1]
        hdu_1.header["delta_z"] = PDF_ref_bins[1] - PDF_ref_bins[0]

    tbhdu = fits.HDUList([hdu_0, hdu_1, hdu_2])
    tbhdu.writeto(args.output, overwrite=True)

    if args.merge_with_input:
        fileIn.close()

    if verbose: sys.stderr.write("done\n")

    if args.plot:

        [zs], _ = getCols(fileInName[0], ["redshift"], select=fileInSelect[0])
        print "Stats (scatter, eta, bias, eta_2sig, N) =", stats(
            est['zmedian'], zs, [0.0, 6.0])

        # PDF_sample, PDF_sample_bins = getPDF(fileInName[0], normalise=args.no_norm, PDF_key="PDF_lephare")
        # plotPDF(pofz, PDF_ref_bins, Nsample,  "PDF.pdf",  zs=zs,  zp=est['zmedian'], PDF2=[PDF_sample, PDF_sample_bins, "lephare"])
        plotPDF(pofz,
                PDF_ref_bins,
                Nsample,
                "PDF.pdf",
                zs=zs,
                zp=est['zmedian'])

    return
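# Hedged sketch, not part of the original script: reading back the FITS file
# written above. It carries a "DATA" table with the point estimates and the
# per-object 'PDF' vectors, plus a "BINS" table with the redshift grid; the
# file name below is illustrative.
from astropy.io import fits

with fits.open("pofz_output.fits") as hdul:
    data = hdul["DATA"].data
    z_grid = hdul["BINS"].data["BINS"]
    zmedian = data["zmedian"]
    pofz_first = data["PDF"][0]  # P(z) of the first object, sampled on z_grid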
Beispiel #53
0
import matplotlib
matplotlib.rc('font', family='Arial')
import matplotlib.pyplot as plt
import pickle
import numpy as np
from scipy.stats import gaussian_kde

with open('flu_preds.pkl','rb') as f:
    antigen,predicted,Y = pickle.load(f)

# with open('ebv_preds.pkl','rb') as f:
#     antigen,predicted,Y = pickle.load(f)
#
# with open('mart1_preds.pkl','rb') as f:
#     antigen,predicted,Y = pickle.load(f)

x = predicted
y = Y
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
r = np.argsort(z)
x, y, z = x[r], y[r], z[r]
plt.figure(figsize=(6,5))
plt.scatter(x, y, s=15, c=z, cmap=plt.cm.jet)
plt.title(antigen, fontsize=18)
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.xlabel('Predicted', fontsize=24)
plt.ylabel('Log2(counts+1)', fontsize=24)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.subplots_adjust(bottom=0.15)
plt.savefig(antigen+'.png',dpi=1200)
def kde(x, x_grid, bandwidth=0.2):
    """Kernel density estimate using Gaussian kernels."""
    estimator = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1))
    return estimator.evaluate(x_grid)
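# Hedged usage sketch for kde() above; the sample data is illustrative.
import numpy as np

x_demo = np.random.randn(500)
grid = np.linspace(-4.0, 4.0, 200)
dens = kde(x_demo, grid, bandwidth=0.2)  # bandwidth is rescaled by x.std(ddof=1)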
# Orphaned tail of a Poisson-disc sampling loop; the enclosing `if` and the
# sampler that builds `samples` (used below) are not included here:
#     else:
#         # We had to give up looking for valid points near refpt, so remove it
#         # from the list of "active" points.
#         active.remove(idx)


#print(samples)
def column(matrix, i):
    return [row[i] for row in matrix]


datax = column(samples, 0)
datay = column(samples, 1)
data = np.array(samples)

kde_uni = stats.gaussian_kde(uniform_noise.sample().T)
density_uni = kde_uni(uniform_noise.sample().T)
#normalize_density = density/max(density)

kde = stats.gaussian_kde(data.T)
density = kde(data.T)
normalize_density = density / max(density)

cmap = cm.jet  #cm.hot #'Blues'

counts, xedges, yedges = np.histogram2d(datax, datay, bins=(60, 45))
#print(counts.shape)
#print(np.amax(counts))
#print(counts)
xidx = np.clip(np.digitize(datax, xedges), 0, counts.shape[0] - 1)
yidx = np.clip(np.digitize(datay, yedges), 0, counts.shape[1] - 1)
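# Hedged continuation sketch (assumes matplotlib.pyplot imported as plt): the
# clipped bin indices above are typically used to colour each sample by its
# local 2D-histogram count.
point_counts = counts[xidx, yidx]
plt.scatter(datax, datay, c=point_counts, cmap=cmap, s=5)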
Beispiel #56
0
def plot_densities(x, nbins=25, tit=''):
    """Overlay a Gaussian-KDE curve for each series in x."""
    for xx in x:
        density = stats.gaussian_kde(xx)
        # evaluate the KDE at the histogram bin edges of the series
        edges = np.histogram(xx, bins=get_bins(xx, nbins)[1:-1])[1]
        plt.plot(edges, density(edges))
    plt.title(tit)
Beispiel #57
0
from scipy.stats import gaussian_kde

def KDE(data):
    """Fit a Gaussian KDE using Silverman's rule for the bandwidth."""
    kernel = gaussian_kde(dataset=data, bw_method='silverman')
    return kernel
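# Hedged usage sketch for KDE(); the data below is illustrative.
import numpy as np

demo = np.random.randn(1000)
kernel = KDE(demo)
density_on_grid = kernel(np.linspace(-3.0, 3.0, 100))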
def _get_kdes(train_ats, train_pred, class_matrix, args):
    """Kernel density estimation

    Args:
        train_ats (list): List of activation traces in training set.
        train_pred (list): List of prediction of train set.
        class_matrix (list): List of index of classes.
        args: Keyboard args.

    Returns:
        kdes (list): List of kdes per label if classification task.
        removed_cols (list): List of removed columns by variance threshold.
    """
    removed_cols = []
    if args.is_classification:
        for label in range(args.num_classes):
            col_vectors = np.transpose(train_ats[class_matrix[label]])
            if args.d == 'imagenet' and (args.model == 'densenet201'
                                         or args.model == 'efficientnetb7'):
                continue
            else:
                for i in range(col_vectors.shape[0]):
                    if (np.var(col_vectors[i]) < args.var_threshold
                            and i not in removed_cols):
                        removed_cols.append(i)

        kdes = {}
        for label in tqdm(range(args.num_classes), desc="kde"):
            refined_ats = np.transpose(train_ats[class_matrix[label]])
            if args.d == 'imagenet' and (args.model == 'densenet201'
                                         or args.model == 'efficientnetb7'):
                pass
            else:
                refined_ats = np.delete(refined_ats, removed_cols, axis=0)

            if refined_ats.shape[0] == 0:
                print(
                    warn("ats were removed by threshold {}".format(
                        args.var_threshold)))
                break
            kdes[label] = gaussian_kde(refined_ats)
    else:
        col_vectors = np.transpose(train_ats)
        for i in range(col_vectors.shape[0]):
            if np.var(col_vectors[i]) < args.var_threshold:
                removed_cols.append(i)

        refined_ats = np.transpose(train_ats)
        refined_ats = np.delete(refined_ats, removed_cols, axis=0)
        if refined_ats.shape[0] == 0:
            print(
                warn("ats were removed by threshold {}".format(
                    args.var_threshold)))
        kdes = [gaussian_kde(refined_ats)]

    print(infog("The number of removed columns: {}".format(len(removed_cols))))

    return kdes, removed_cols
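# Hedged usage sketch, not part of the original code: the per-class KDEs are
# commonly turned into a density score for a new activation trace, e.g. the
# negative log-density under the predicted class ('at' is a 1-D trace).
def _score_at(at, label, kdes, removed_cols):
    refined = np.delete(at, removed_cols, axis=0)
    return -kdes[label].logpdf(refined)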
Beispiel #59
0
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, t, laplace, gaussian_kde

sig = dyRet.std()[0]
mu = dyRet.mean()[0]  # assumed: mu is used below but was not defined in this fragment

yRet = np.array(dyRet)

plt.subplot(221)
plt.hist(yRet, density=True, bins=100, color='grey')
distance = np.linspace(min(yRet), max(yRet))
plt.plot(distance, norm.pdf(distance, mu, sig), label='Normal', c='r')
plt.xlabel('log return')
plt.ylabel('density')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(222)
yNRet = (yRet - mu) / sig  #standardization
distanceN = np.squeeze(np.linspace(min(yNRet), max(yNRet)))
kernel = gaussian_kde(np.squeeze(yNRet))
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(223)
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, t.pdf(distanceN, df=2), label='t-dist, df=2', c='g')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(224)
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.plot(distanceN, t.pdf(distanceN, df=2), label='t-dist, df=2', c='g')
plt.plot(distanceN, laplace.pdf(distanceN), label='laplace-dist', c='y')
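# Note: laplace.pdf defaults to loc=0, scale=1 (variance 2); for a unit-variance
# comparison with the standardised returns one could pass scale=1/np.sqrt(2).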
Beispiel #60
0
MPL_plateifu = Table_mpl5['PLATEIFU'].astype('object')
MPL_ID = [ID.strip() for ID in MPL_plateifu]
Table_weights = pd.DataFrame({'plateifu':MPL_ID,'weight':Weights})
data_SF_w = pd.merge(Table_weights, data_SF)
P0_w = data_SF_w[data_SF_w.mode_flag!=0]
P0_w = P0_w[P0_w.re_arc>2.5]
P1_w, P2_w = P0_w[P0_w.mode_flag==1], P0_w[P0_w.mode_flag==-1]

#--------Merging Measurements Over--------#

# Fit a KDE on a 10% subsample, then keep the densest 80% of the data
W = data_SF[data_SF.mode_flag!=0]
xx, yy = np.mgrid[5:11:60j, -5:1:60j]
positions = np.vstack([xx.ravel(), yy.ravel()])
W0 = W.sample(frac=0.1)
kernel = gaussian_kde(np.vstack([W0.Sigma_Mass, W0.Sigma_SFR]))
print "Use KDE to derive PDF... %d points used."%len(W0)
pdf = pd.Series(kernel.pdf(np.vstack([W.Sigma_Mass, W.Sigma_SFR])))
use_80 = (pdf > pdf.quantile(0.2)).values
W_80 = W[use_80]
print "Select 80% of data... Finish!"

# plot Fig.2
mp,sfrp,cof = median_fitting(P0.Sigma_Mass, P0.Sigma_SFR, q=0.01,d=7)
m_i,sfr_i,cof_i = median_fitting(P1.Sigma_Mass, P1.Sigma_SFR, q=0.01,d=7)
m_o,sfr_o,cof_o = median_fitting(P2.Sigma_Mass, P2.Sigma_SFR, q=0.01,d=7)

plt.figure(figsize=(15,5.))
for i, (W, c, cmap, lab, p) in enumerate(
        zip([P0, P1, P2], ['k', 'r', 'b'], ['Greys', 'Reds', 'Blues'],
            ['Total', 'Inside-out', 'Outside-in'], ['a', 'b', 'c'])):
	ax = plt.subplot(1, 3, i + 1)
	plt.text(0.018, 0.05, "%s)" % p, fontsize='large', transform=ax.transAxes)