Example #1
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    _predict_params = []
    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
        self.perc_keep = kwargs["perc_keep"]
    
    def fit(self, data, **kwargs):
        #self.train_data = data
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        
        # random half/half split of the data: fit on one half, threshold on the other
        idx = numpy.random.randint(2, size=len(data)).astype(bool)
        print(idx)

        self.kde.fit(data[idx, :])
        self.training_score = self.kde.score_samples(data[~idx, :])
        self.direct_thresh = numpy.percentile(self.training_score, 100 - self.perc_keep)

        print('training', self.training_score.min(), self.training_score.mean(), self.training_score.max(), self.direct_thresh)
        print(self.direct_thresh)
    
    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        res = (score < self.direct_thresh)
        print('test', self.score.min(), self.score.mean(), self.score.max())
        print(res.sum(), 'of', len(self.score), 'outliers')
        
        return res.astype(numpy.uint8) * -2 + 1  # True (outlier) -> -1, False -> +1
    
    def decision_function(self, data=None):
        return self.score
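A minimal usage sketch for the class above (not part of the original source), assuming BaseClassifier needs nothing beyond what the snippet shows and that numpy and KernelDensity are imported as in the snippet:

import numpy
from sklearn.neighbors import KernelDensity

clf = OneClassKDE(bandwidth=0.5, perc_keep=90)
train = numpy.random.randn(200, 2)                   # inliers
test = numpy.vstack([numpy.random.randn(20, 2),
                     5 + numpy.random.randn(5, 2)])  # a few far-away outliers
clf.fit(train)
labels = clf.predict(test)   # +1 = inlier, -1 = outlier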
Example #2
def kernel_estimation(test,train_n,train_p):    
    relevance_score=[]
    result_n=[]
    result_p=[]   

    X_n=np.array(train_n)   
    X_p=np.array(train_p)
    Y=np.array(test)
    
    #params = {'bandwidth': np.logspace(-1, 1, 20)}
    #grid = GridSearchCV(KernelDensity(), params)
    #grid.fit(X_n)
    
    #print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))    
    
    kde_n = KernelDensity(kernel='gaussian', bandwidth=0.999).fit(X_n)
    kde_p = KernelDensity(kernel='gaussian', bandwidth=4.772).fit(X_p)
    # score all test points at once instead of parsing per-row string output
    result_n = np.exp(kde_n.score_samples(Y))
    result_p = np.exp(kde_p.score_samples(Y))
    
    for i in range(len(result_n)): 
        if result_n[i]==0.0:
            # floor the denominator at a tiny constant to avoid division by zero
            relevance_score.append(np.log(result_p[i]/1.8404e-17+1))
        else:
            relevance_score.append(np.log(result_p[i]/result_n[i]+1))

    return relevance_score
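The commented-out GridSearchCV block above hints at automatic bandwidth selection; a sketch of what that search could look like (the grid values mirror the commented lines, everything else is illustrative):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(kernel='gaussian'), params, cv=5)
grid.fit(X_n)  # X_n as in kernel_estimation above
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))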
Example #3
def kernel_pmi_func(df, x, y, i, b=1.0):
    x = np.array(df[x])
    y = np.array(df[y])
    x_y = np.stack((x, y), axis=-1)
    
    kde_x = KernelDensity(kernel='gaussian', bandwidth=b).fit(x[:, np.newaxis])
    kde_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(y[:, np.newaxis])
    kde_x_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(x_y)
    
    p_x = pd.Series(np.exp(kde_x.score_samples(x[:, np.newaxis])))
    p_y = pd.Series(np.exp(kde_y.score_samples(y[:, np.newaxis])))
    p_x_y = pd.Series(np.exp(kde_x_y.score_samples(x_y)))   
    
    df['PMI_'+str(i)] = np.log( p_x_y / (p_x * p_y) )
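A hypothetical call of kernel_pmi_func on a toy DataFrame (the column names 'a' and 'b' are made up); for independent columns the pointwise mutual information should hover near zero:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': np.random.randn(500), 'b': np.random.randn(500)})
kernel_pmi_func(df, 'a', 'b', i=0, b=1.0)   # adds a 'PMI_0' column
print(df['PMI_0'].head())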
Example #4
def kernel_reg(df, bw=30, indepv="ARRTIME", kernel='gaussian'):
    '''
    Fit a KDE with the given kernel and bandwidth on the training split
    and score the test split.

    Inputs:
        df: pandas dataframe
        indepv: (a list of) strings naming the independent variables
        bw: (float) bandwidth
        kernel: (str) kernel name

    Plots the log-density of the test points against the observed WAITTIME.
    '''
    y = df["WAITTIME"]
    x = df[indepv]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=42)

    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)
    #kernel density estimations

    # instantiate and fit the KDE model
    kde = KernelDensity(kernel=kernel, bandwidth=bw).fit(train)
    # score_samples returns the log of the probability density
    logprob = kde.score_samples(test)
    plt.scatter(y_test, logprob, label='%s, bw=%s' % (kernel, bw))
    plt.legend(loc=0)
    plt.show()
Example #5
def kernel_fit_single(data, bw=None, min_size=20, kern='gaussian'):
    """ Gaussian kernel fit to 1D data
    """
    res = np.histogram(data.ravel(), bins='sqrt', density=True)
    std_data = data.std()
    if bw is None:
        # bandwidth heuristic (Scott-like rule, using the sample std)
        bw = (data.ravel().shape[0] * (std_data + 2) / 4.)**(-1. /
                                                             (std_data + 4))

    N_bins = res[1].shape[0]
    if (N_bins < min_size):
        extra = 0.2
        #N_bins *=2
    else:
        extra = 0.0
    # get plus or minus 20%

    x_grid = np.linspace(res[1][0] - extra * abs(res[1][0]),
                         res[1][-1] + extra * abs(res[1][0]), N_bins)

    kde = KernelDensity(bandwidth=bw, kernel=kern)
    kde.fit(data.ravel()[:, None])

    pdf = np.exp(kde.score_samples(x_grid[:, None]))

    return pdf, x_grid
Example #6
def kde_labeler(picks):
    if isinstance(picks, torch.Tensor):
        picks = picks.clone().cpu().data.numpy().astype(int)
    nums = np.array([x for x in range(0, 101)]).reshape(-1, 1)
    picks = picks.reshape(-1, 1)
    lower = np.percentile(picks, 25)
    upper = np.percentile(picks, 75)
    IQR = upper - lower
    std = picks.std()
    if std < 0.5:
        std = 1.0
        IQR = 1.0

    if IQR < 0.1:
        IQR = 0.1
    # Silverman's rule-of-thumb bandwidth
    m = min(std, IQR / 1.349)
    bandwidth = 0.9 * float(m) / float(len(picks)) ** 0.2

    if bandwidth > 5:
        # TODO: Handle this in a manner not using print statements. Maybe set a warning flag
        print(
            f"Bandwidth too high! m: {m} std: {std} IQR: {IQR} bandwidth: {bandwidth}"
        )

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(picks)

    log_dens = kde.score_samples(nums)
    label = np.exp(log_dens)
    label = label / label.sum()
    return label
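A small sketch of how kde_labeler might be called, assuming the snippet's torch import is available; the return value is a length-101 probability vector over the pick range 0-100:

import numpy as np

picks = np.random.randint(40, 60, size=50)
label = kde_labeler(picks)
print(label.shape, label.sum())   # (101,), sums to 1.0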
Example #7
def kde():
    k, v = [], []
    score_kde, u = {}, 0
    result, frequency, lat, lon = ope_file(
        path=r"C:/Users\AMITY UNIVERSITY\Desktop\QWER\TEST_DATA")
    mlat, mlon, path, north, south, east, west, cout, od, centroid, countdiction, pathdistion, sqcout = pathlist(
        lat, lon)
    for key, value in centroid.items():
        print(value)
        input()  # pause so each centroid can be inspected
        cou = 0
        for i in zip(result, frequency):
            print(haversine(value[0], value[1], i[0][0], i[0][1]))
            X = np.array([i[0][0], i[0][1]])
            kde = KernelDensity(kernel='gaussian',
                                bandwidth=0.3).fit([[i[0][0], i[0][1]]])
            cou += kde.score_samples([centroid[key]]) * i[1]
            print(cou)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        score_kde[key] = cou
        k.append(key)
        v.append(cou)
    li = colo(v)
    print(score_kde, k, v, li)
    return mlat, mlon, path, north, south, east, west, cout, od, centroid, countdiction, pathdistion, sqcout
Example #8
class KDECluster:
    '''
    points is a vector of vectors [[],[]]
    '''
    def __init__(self, points, bw):
        if len(points) < 5:
            self.kde_ = KernelDensity(kernel='gaussian', bandwidth=bw)
        else:
            self.kde_ = KernelDensity(kernel='epanechnikov',
                                      algorithm='ball_tree',
                                      bandwidth=bw,
                                      leaf_size=50)

        self.points_ = points

        self.kde_.fit(points)

    #..........................................................................
    def compare(self, cluster):
        scores_self = np.exp(self.kde_.score_samples(cluster.points_))
        scores_clus = np.exp(cluster.kde_.score_samples(self.points_))

        m_self = max(scores_self)
        m_clus = max(scores_clus)

        return max(m_clus, m_self)
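A hedged usage sketch for KDECluster (not from the source): build two clusters from synthetic 2-D points and compare them; well-separated clusters should score near zero:

import numpy as np

a = KDECluster(np.random.randn(100, 2), bw=0.5)        # epanechnikov branch
b = KDECluster(np.random.randn(100, 2) + 3.0, bw=0.5)
print(a.compare(b))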
Example #9
def plot_kde(obj, lo, hi, true, test):
    obj_plot = np.linspace(lo, hi, 10000)[:, np.newaxis]
    avg_std = np.mean(std(obj))
    bandwidth = 1.06 * avg_std * len(obj)**-0.2
    plt.figure()
    #    ax = plt.gca()
    for i in range(obj.shape[1]):
        a = obj[:, i][:, np.newaxis]
        #1.06*np.std(a)*len(a)**-0.2 # Bandwidth estimated by Silverman's Rule of Thumb
        kde = KernelDensity(bandwidth=bandwidth,
                            kernel='gaussian',
                            algorithm='ball_tree')
        kde.fit(a)
        log_dens = kde.score_samples(obj_plot)
        plt.plot(obj_plot, np.exp(log_dens))
#        vline_color = next(ax._get_lines.prop_cycler)['color']
#        plt.axvline(np.mean(a), linestyle=':', color = vline_color, label='Update %i' %(i+1))
    plt.axvline(np.mean(average(obj)),
                color='red',
                label='Mean of all predictions')
    plt.axvline(true,
                label='True value',
                linestyle='dashdot',
                color='black',
                linewidth=2)
    plt.ylabel('PDF')
    plt.xlabel('Cycle')
    plt.tight_layout()
    plt.legend()
Example #10
def _importance_preprocess_uni(states, rewards, gradients, p_tar, p_gen):
    res = _create_episode_info()

    flat_states = [s for traj in states for s in traj]
    # TODO Pass in as args?
    kde = KernelDensity(kernel='gaussian', bandwidth=0.25)
    kde.fit(flat_states)

    for ss, rs, gs, ps, qs in zip(states, rewards, gradients, p_tar, p_gen):

        state_probs = kde.score_samples(ss)
        traj_p = np.cumsum(ps)  # + np.mean(state_probs)
        traj_q = np.cumsum(qs) + state_probs
        traj_grads = np.cumsum(gs, axis=0)
        r_acc = np.cumsum(rs[::-1])[::-1]
        r_grad = (r_acc * traj_grads.T).T

        res.r_grads.extend(r_grad)
        res.traj_p_tar.extend(traj_p)
        res.traj_p_gen.extend(traj_q)
        res.traj_grads.extend(traj_grads)
        res.traj_r.extend(r_acc)

        # Used for estimating fisher
        res.act_grads.extend(gs)
        res.state_act_p_tar.extend(traj_p)
        res.state_act_p_gen.extend(traj_q)

    return res
Example #11
def basic_properties( sequences , axess=None, labl = None, logscale=[False], markr='.', clr='k',offset=0, alfa = 0.8,
                      distir = [False,False,False, False], bandwidths = [3, 0.1,0.01,1], limits = [(1,50),(0,1),(0,1),(1,25)] ):
    if axess is None:
        fig,axess = plt.subplots( 3, len(sequences),False,False, squeeze=False,figsize=(len(sequences)*3,8))#'col'
    plt.subplots_adjust(left=0.12, bottom=0.05, right=0.95, top=0.94,   wspace=0.28, hspace=0.1)
    plt.subplots_adjust(left=0.45, bottom=0.05, right=0.95, top=0.94,   wspace=0.28, hspace=1.2)

    for i in range(0,len(sequences)):
        ax = axess[offset][i]
        seq = sequences[i]
        smax = max(seq)
        smin = min(seq)

        if distir[i]==0:
            #print seq
            freqs , bin_edges = np.histogram(seq,  smax+1 if smax>1 else 100, range = (0,smax+1) if smax>1 else (0,smax))#, normed = True, density=True)
            bin_centers =  (bin_edges[:-1] + bin_edges[1:])/2.
            vals = range(0,smax+1) if smax>1 else bin_centers
            freqs=freqs*1.0/sum(freqs)
            #remove zeros
            y = np.array(freqs)
            nz_indexes = np.nonzero(y)
            y = y[nz_indexes]
            x = np.array(vals)[nz_indexes]
            ax.plot(x, y,':', label=labl, alpha =alfa, color = clr ,  marker ='.')
        else :
            X = np.array(seq)
            X = [ x for x in X if x>=limits[i][0] and x<=limits[i][1]]
    #         X= (np.abs(X))
#             print len(X)
            X = np.random.choice(X, size=min(10000, len(X)))
            X = X[:, np.newaxis]
            kde = KernelDensity(kernel = 'gaussian', bandwidth=bandwidths[i]).fit(X)#,atol=atols[i],kernel = 'tophat'kernel='gaussian'
#             if 'x' in logscale[i] : 
#                 X_plot = np.logspace( limits[i][0],  limits[i][1], 1000)[:, np.newaxis]
#             else :
            X_plot = np.linspace(limits[i][0], limits[i][1], 1000)[:, np.newaxis]
    
            log_dens = kde.score_samples(X_plot) #
    #         ax.fill(X_plot[:, 0], np.exp(log_dens), alpha =0.5, label=labl)
            Y  =  np.exp(log_dens)
            if  distir[i]==2: Y = np.cumsum(Y)
            ax.plot(X_plot[:, 0],Y, '-',label=labl, alpha =alfa, color = clr ,markersize=2,  marker ='')
    
            verts = [(limits[i][0]-1e-6, 0)] + list(zip(X_plot[:, 0],Y)) + [(limits[i][1]+1e-6, 0)]
            poly = Polygon(verts, facecolor=clr,  alpha =alfa ) #, edgecolor='0.5')
            ax.add_patch(poly)
    #         ax.set_yticks([])
    #         ax.set_ylim(bottom=-0.02)
            ax.set_xlim(limits[i][0],limits[i][1])
            
        if len(logscale)==len(sequences): 
            if 'x' in logscale[i] : 
                ax.set_xscale('log')
            if 'y' in logscale[i] : 
                ax.set_yscale('log')
                if i<3: ax.set_ylim(bottom=0.001)
#         ax.legend()
#         plt.show(block=False)
    return axess
Example #12
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log of the probability density at the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    return np.exp(log_pdf)
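A typical call of kde_sklearn (grid and bandwidth values are illustrative), evaluating the density on an evenly spaced grid; extra keyword arguments are forwarded to KernelDensity:

import numpy as np

x = np.random.randn(1000)
x_grid = np.linspace(-4, 4, 200)
pdf = kde_sklearn(x, x_grid, bandwidth=0.3, kernel='gaussian')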
Example #13
    def fitness(self, individual, markets):
        portfo_return = np.zeros(len(markets[0]))
        for j in range(len(markets[0])):
            portfo_return[j] = np.dot(np.array(individual) , np.array(markets).T[j])

        ret = portfo_return
        X = ret[:, np.newaxis]
        X_plot = np.linspace(min(ret),max(ret), 200)[:, np.newaxis]
        kde = KernelDensity(kernel='gaussian', bandwidth=self.band_width).fit(X)
        log_dens = kde.score_samples(X_plot)
        pdf = np.exp(log_dens)

        sec_dev = np.diff(pdf,2)
        qpot = [500.0]
        for i in range(1,len(pdf)-1):
            if pdf[i] > 0.0001:
                jj = sec_dev[i-1]/pdf[i]
            else:
                jj=500
            qpot.append(jj) 
        qpot.append(500)
        # dd = X_plot[argrelextrema(qpot, np.greater)]
        # risk = dd[dd>0][0] - dd[dd<0][-1]

        xx =[]
        x = X_plot.reshape(len(qpot))
        for i in range(len(qpot)):
            if qpot[i] >= 499:
                xx.append(i)
        x_list = np.array(x)[xx]
        d_lim = x_list[x_list<0][-1]
        u_lim = x_list[x_list>0][0]

        return u_lim-d_lim
Example #14
def fitKDE(obs, bWidth=0.25, kernel='gaussian', x=None):
    """
    Fit a kernel to a series of observations and derive their empirical
    probability density.
    Snippet 2.2, Testing the Marcenko-Pastur Theorem
    Args:
        obs: array of observations
        bWidth: kernel bandwidth
        kernel: kernel name
        x: array of values on which the fitted KDE will be evaluated

    Returns:
        pdf: empirical pdf as a pandas Series indexed by x
    """

    if len(obs.shape) == 1:
        obs = obs.reshape(-1, 1)

    if x is None:
        x = np.unique(obs).reshape(-1, 1)

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)

    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())

    return pdf
Example #15
def xy_kde(xy,bandwidth,N_grid=100,levels=[0.8,0.6,0.4,0.2]):  
    
    x_edges = np.linspace(np.min(xy[:,0]),np.max(xy[:,0]),N_grid+1)
    y_edges = np.linspace(np.min(xy[:,1]),np.max(xy[:,1]),N_grid+1)
    x_centres = np.array([x_edges[b] + (x_edges[b+1]-x_edges[b])/2 
                          for b in range(N_grid)])
    y_centres = np.array([y_edges[b] + (y_edges[b+1]-y_edges[b])/2 
                          for b in range(N_grid)])
    x_grid, y_grid = np.meshgrid(x_centres,y_centres)
    xy_grid = np.array([np.ravel(x_grid),np.ravel(y_grid)]).T
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid,N_grid))
    # this bit is taken from the corner_plot.py method.
    ######################################
    Hflat = H.flatten()
    inds = np.argsort(Hflat)[::-1]
    Hflat = Hflat[inds]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]
    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        try:
            V[i] = Hflat[sm <= v0][-1]
        except IndexError:
            V[i] = Hflat[0]
    #####################################
    V = np.sort(V)
    
    return H, V, x_grid, y_grid
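A sketch (not from the source) showing how the returned grid and levels can feed a contour plot; V is sorted ascending, as matplotlib requires:

import numpy as np
import matplotlib.pyplot as plt

xy = np.random.randn(2000, 2)
H, V, x_grid, y_grid = xy_kde(xy, bandwidth=0.3)
plt.contour(x_grid, y_grid, H, levels=V)
plt.show()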
Example #16
def fitKDE(lst_obs,
           flt_bandwidth=0.25,
           str_kernel="gaussian",
           lst_x_eval=None):
    """
    Function that fits a (given) Kernel Density Estimator to a series of observations, and derive the probability of observation
    :param lst_obs: the list of observations
    :param flt_bandwidth: the bandwidth of the kernel
    :param str_kernel: the kernel to use
    :param lst_x_eval: array of values on which the fitted KDE will be evaluated
    return: dtf_pdf: empirical pdf
    """
    # List of observations lst_obs must be a 2-dimensional array
    if len(lst_obs.shape) == 1:
        lst_obs = lst_obs.reshape(-1, 1)

    # Initialize the KDE and fit it on the observations
    skl_kde = KernelDensity(bandwidth=flt_bandwidth,
                            kernel=str_kernel).fit(lst_obs)

    # List lst_x_eval must be a 2-dimensional array too
    # If lst_x_eval is not provided, initialize it as the list of unique observations
    if lst_x_eval is None:
        lst_x_eval = np.unique(lst_obs).reshape(-1, 1)

    if len(lst_x_eval.shape) == 1:
        lst_x_eval = lst_x_eval.reshape(-1, 1)

    # Evaluate the log density model on the data (i.e., on lst_x_eval)
    lst_logProb = skl_kde.score_samples(X=lst_x_eval)

    # Return the evaluations as pandas series
    dtf_pdf = pd.Series(data=np.exp(lst_logProb), index=lst_x_eval.flatten())

    return dtf_pdf
Example #17
def kernel_density_estimate(histogram, num_samples, bandwidth):
	kernel = 'gaussian'
	# kernel = 'epanechnikov'
	kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(histogram.reshape(-1, 1))
	# kde = KernelDensity(kernel='gaussian', bandwidth=3).fit(a)
	# s = np.linspace(0, 50)
	# v = np.linspace(0,10,10)
	# plt.plot(v, a)
	# plt.show()
	# wid = 831
	s = np.linspace(0, num_samples, num_samples)
	# e = kde.score_samples(s.reshape(-1, 1))
	e = kde.score_samples(s.reshape(-1, 1))
	prb = np.exp(e)

	# plt.figure()
	# print('e', e.shape, e)
	# print('prb', prb.shape, prb)
	'''reduce '''
	# reduced_hist = np.unique(hist)
	# num_bin = len(reduced_hist)

	# print('num_bin', num_bin)


	'''extremas of probability density'''
	mi, ma = argrelextrema(prb, np.less)[0], argrelextrema(prb, np.greater)[0]
	# print("Minima:", mi)
	# print("Maxima:", ma)
	return mi,ma, s, prb
Example #18
    def _density(self):
        d = list()
        h = dict()
        bw = self.np.size**(-1. / 5)
        kd = KernelDensity(kernel='gaussian',
                           bandwidth=bw).fit(self.np.reshape(-1, 1))
        kd_vals = np.exp(kd.score_samples(self.np.reshape(-1, 1)))

        for i, x in enumerate(kd_vals):
            h[self.np[i]] = x
        for x in sorted(h):
            dp = h[x] * (0.4 / max(kd_vals))
            d.append([x, self.seriesId - dp, self.seriesId + dp])
        return {
            'id': str(self.seriesId),
            'name': self.seriesName,
            'type': 'areasplinerange',
            'enableMouseTracking': False,
            'marker': {
                'symbol': 'circle',
                'enabled': False
            },
            'color':
            'Highcharts.getOptions().colors[' + str(self.seriesId) + ']',
            'data': d
        }
Example #19
    def kernel_density(self):

        pre_data = self.data

        start = np.array(pre_data)

        start_len = len(start)

        resolution = np.linspace(0, 1, num=10).tolist()

        pre_data = np.histogram(pre_data, bins=resolution)[0]

        pre_data = pre_data / max(pre_data)

        pre_data = np.array([int(i*100) for i in pre_data.tolist()])

        initial_length = int(len(pre_data) * 2) # 2 is an arbitrary good number to use

        a = pre_data.reshape(-1, 1)

        kde = KernelDensity(kernel='gaussian', bandwidth=2).fit(a)
        s = np.linspace(0, initial_length)
        e = kde.score_samples(s.reshape(-1, 1))

        lower_boundaries = argrelextrema(e, np.less)[0]

        minima = s[lower_boundaries]

        demodulated_index = [int((i/initial_length)*start_len) for i in minima]

        return start[np.array(demodulated_index)]
Example #20
def plot_dos(phonons,
             bandwidth=.05,
             n_points=200,
             is_showing=True,
             input_fig=None):
    if input_fig is None:
        fig = plt.figure()
    else:
        fig = input_fig
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
        phonons.frequency.flatten(order='C').reshape(-1, 1))
    x = np.linspace(phonons.frequency.min(), phonons.frequency.max(), n_points)
    y = np.exp(kde.score_samples(x.reshape((-1, 1))))
    plt.plot(x, y)
    plt.fill_between(x, y, alpha=.2)
    plt.xlabel("$\\nu$ (THz)", fontsize=16)
    plt.ylabel('DOS', fontsize=16)
    plt.tick_params(axis='both', which='major', labelsize=16)
    plt.tick_params(axis='both', which='minor', labelsize=16)
    folder = get_folder_from_label(phonons, base_folder=DEFAULT_FOLDER)
    if not os.path.exists(folder):
        os.makedirs(folder)
    fig.savefig(folder + '/' + 'dos.png')
    if is_showing:
        plt.show()
    elif input_fig is None:
        plt.close()
    else:
        return fig
Example #21
def kde_example():

    # ----------------------------------------------------------------------
    # Plot a 1D density example
    N = 100
    np.random.seed(1)
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]

    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0])
                 + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))

    fig, ax = plt.subplots()
    ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2,
            label='input distribution')

    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(N))

    ax.legend(loc='upper left')
    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
Example #22
def KDE(lst, dum):
    result = []
    lst2 = []
    lst.sort()
    if len(lst) > 1:
        for i in range(len(lst)):
            f, w = math.modf(lst[i])
            lst2.append(round(w + (f / 0.6), 2))
        a = array(lst2).reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.45).fit(a)
        s = linspace(0, 24)
        e = kde.score_samples(s.reshape(-1, 1))
        mi = argrelextrema(e, np.less)[0]
        mi = s[mi]
        if (len(mi) > 0):
            for k in range(len(mi) + 1):
                if k == 0:
                    result.append(list(filter(lambda i: i['t'] < mi[k], dum)))
                elif k == len(mi):
                    result.append(
                        list(filter(lambda i: i['t'] >= mi[k - 1], dum)))
                else:
                    result.append(
                        list(
                            filter(
                                lambda i: i['t'] >= mi[k - 1] and i['t'] < mi[
                                    k], dum)))
            return result
        else:
            return dum
    else:
        return 0
Example #23
def entropy_Integral(X):
    '''
    Integral Estimate using summation
    '''
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X[:, None])
    logprob = kde.score_samples(X[:, None])
    return -1 * np.average(logprob, weights=np.exp(logprob))
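A quick exercise of the estimator on standard-normal samples; the analytic differential entropy 0.5*log(2*pi*e) ≈ 1.419 is shown only for reference, since the density-weighted average above is a rough plug-in estimate rather than the usual -mean(logprob):

import numpy as np

X = np.random.randn(5000)
print(entropy_Integral(X), 0.5 * np.log(2 * np.pi * np.e))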
Example #24
def sklearn_kde_plot(dataframe, choose_choice, topic_name, fold_num):
    # print(dataframe)
    N = dataframe.values.size
    X = dataframe.values[:, np.newaxis]

    # X_plot = np.linspace(min(dataframe.values), max(dataframe.values), num=500)[:, np.newaxis]
    X_plot = np.linspace(min(dataframe.values), 10, num=500)[:, np.newaxis]                                     # SET THISS
    # X_plot = np.linspace(min(dataframe.values), 10, num=500)[:, np.newaxis]
    # print(min(dataframe.values))
    # print(max(dataframe.values))
    # print(dataframe)

    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0]) + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))
    fig, ax = plt.subplots()
    # ax.fill(X_plot, true_dens, fc='black', alpha=0.2, label='input distribution')

    # kde = KernelDensity(kernel='gaussian', bandwidth=0.005).fit(X)  # 'tophat', 'epanechnikov'
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(X)  # 'tophat', 'epanechnikov'              SET THISSSSSSSS
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0], np.exp(log_dens), '-', label="kernel = '{0}'".format('gaussian'))

    ax.text(6, 0.38, "N={0} points".format(N))
    ax.legend(loc='upper right')
    # ax.plot(X[:, 0], -0.005 - 0.0005 * np.random.random(X.shape[0]), '+k')
    ax.plot(X[:, 0], -0.005 - 0.005 * np.random.random(X.shape[0]), '+k')

    # ax.set_xlim(min(dataframe.values), max(dataframe.values))
    ax.set_xlim(0, 10)                                                                                      # SET THISSSSSSSS
    # ax.set_ylim(-0.02, 1)
    ax.set_ylim(-0.02, 1.0)                                                                                 # SET THISSSSSSSS
    ax.set_xlabel("Delta Follower")
    ax.set_ylabel("Density")
    plt.title('Density - ' + choose_choice + ' (' + topic_name + ', ' + fold_num + ')')
    plt.show()
    return
Example #25
 def density(self,
             x=None,
             kernel='gaussian',
             bandwidth=0.1,
             rtol=0.05,
             **kwargs):
     """Kernel density estimation.
     
     Parameters
     ----------
     x : ndarray
         Time points at which to evaluate density. If `x` is None, then 
         a KernelDensity object is returned.
     kernel : str
     bandwidth : scalar
         Kernel bandwidth
     rtol, **kwargs : extra arguments for sklearn.neighbors.KernelDensity.
     
     Returns
     -------
     ndarray or KernelDensity object
     
     """
     from sklearn.neighbors import KernelDensity
     #create density function
     #TODO test speed of KernelDensity - roll our own?
     kde = KernelDensity(kernel=kernel,
                         bandwidth=bandwidth,
                         rtol=rtol,
                         **kwargs).fit(self._data[:, None])
     if x is not None:
         #evaluate density function
         x = np.array(x, copy=False)
         kde = np.exp(kde.score_samples(x[:, None]))
     return kde
Example #26
    def get_highest_concentration(self, remove=False):
        '''Returns the point in the unassigned set with the highest estimated concentration 
        using the Gaussian KDE estimator.
        args:
            remove: Boolean to define if the point should be taken out of the unassigned list or not.
        returns:
            The point with the highest concentration.    
        '''
        temp_point_list = list(
            self._points)  #To preserve order going through kde estimation.

        lat_lng = {
            'lat': [p.lat for p in temp_point_list],
            'lng': [p.lng for p in temp_point_list]
        }

        lat_lng = pd.DataFrame(lat_lng)
        # NOTE: sklearn's haversine metric expects [lat, lng] in radians
        kde = KernelDensity(bandwidth=0.2, metric='haversine').fit(lat_lng)
        scored = kde.score_samples(lat_lng)
        lat_lng = lat_lng.assign(density=scored)
        highest = lat_lng['density'].idxmax()

        highest = temp_point_list[highest]

        if remove:
            self._points.remove(highest)

        return highest
Example #27
def plot_kde(data, ax, settings):
    try:
        from sklearn.neighbors import KernelDensity
    except ImportError:
        warnings.warn(
            "Cannot import sklearn.neighbors.KernelDensity. Cannot plot kernel density estimate."
        )
        return
    x = np.linspace(0, max(data), 200)
    if settings["plotting.kde_bandwidth"] is not None:
        bw = settings["plotting.kde_bandwidth"]
        kde = KernelDensity(kernel=settings["plotting.kde_kernel"],
                            bandwidth=bw).fit(data.values.reshape(-1, 1))

    else:
        grid = GridSearchCV(
            KernelDensity(kernel=settings["plotting.kde_kernel"]),
            {'bandwidth': np.linspace(math.radians(2), math.radians(30), 40)},
            cv=min(10, len(data)))  # 10-fold cross-validation
        try:
            grid.fit(data.values.reshape(-1, 1))
        except ValueError:
            return  #Do not plot kde, if we do not have enough datapoints
        #print("Bandwidth = {}".format(grid.best_params_))
        kde = grid.best_estimator_
    ax.plot(x,
            np.exp(kde.score_samples(x.reshape(-1, 1))),
            label="kde",
            linewidth=settings["plotting.kde_linewidth"],
            color=settings["plotting.kde_color"])
Example #28
def KDE(lst):
    result = []
    lst2 = []
    lst.sort()
    if len(lst) > 1:
        for i in range(len(lst)):
            f, w = math.modf(lst[i])
            lst2.append(round(w + (f / 0.6), 2))
        a = array(lst2).reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.45).fit(a)
        s = linspace(0, 24)
        e = kde.score_samples(s.reshape(-1, 1))
        mi = argrelextrema(e, np.less)[0]
        if (len(mi) > 0):
            for i in range(len(mi) + 1):
                if i == 0:
                    temp = a[a < s[mi[i]]]
                elif i == len(mi):
                    temp = a[a >= s[mi[i - 1]]]
                else:
                    temp = a[(a >= s[mi[i - 1]]) * (a <= s[mi[i]])]
                if (len(temp) > 1):
                    for j in range(len(temp)):
                        f, w = math.modf(temp[j])
                        temp[j] = round(w + (f * 0.6), 2)
                    result.append(temp)
            print(result)
        else:
            print(lst)
    else:
        print('no')
Example #29
def cluster_density(coords, gridcoords, bw=0.004, CV_bw=False):
    """Compute KDE and return scores of grid points

    Parameters:
    ------------
        coords: np.array (n, 3)
            coordinates of points in cluster(!)
        gridcoords: np.array (n, 3)
            coordinates of grid, in which the density must be mapped
        bw: float
            bandwidth of Gaussian kernel, now hand tuned, depends on rectangular grid resolution.
            bandwidth also depends on user; how important are 'lonely' neurons in the clusters?

    Returns:
    -----------
        den
            scores of grid coords
    """
    if CV_bw:
        grid = GridSearchCV(KernelDensity(),
                            {'bandwidth': np.linspace(0.005, 0.03, 11)}, cv=25, verbose=0)
        grid.fit(coords)
        bw = grid.best_params_['bandwidth']
        print(f'Best bandwidth {bw}')
        return grid

    else:
        kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(coords)
    #    den = np.exp(kde.score_samples(refcoords))  # return scores
        den = kde.score_samples(gridcoords)  # return log scores
        return den
Example #30
def KL_div(approximate_target_samples,
           target_dist_parameters,
           true_target='Gamma'):
    """ This function calculates the approximate KL divergence between 
    a true density function and an approximate one.
    The approximation is done via Gaussian KDE.
    """
    #    import pdb; pdb.set_trace()
    if true_target == 'Gamma':
        true_target_pdf = st.gamma.pdf(approximate_target_samples,
                                       a=target_dist_parameters[0],
                                       scale=1.0 / target_dist_parameters[1])
    elif true_target == 'Beta':
        true_target_pdf = st.beta.pdf(approximate_target_samples,
                                      a=target_dist_parameters[0],
                                      b=target_dist_parameters[1])

    # Now approximate the approximate_target_pdf

    T1_samples_for_kde = approximate_target_samples[:, np.newaxis]
    T1_kde = KernelDensity(kernel='gaussian').fit(T1_samples_for_kde)
    log_density = T1_kde.score_samples(T1_samples_for_kde)
    approximate_target_pdf = np.exp(log_density)

    # If the true_target_pdf has any 0 entries, the entropy will blow up
    # Take these samples out
    if np.any(true_target_pdf == 0):
        print('PDF of 0 detected at ' +
              str(np.sum(true_target_pdf == 0)) + ' points.')
        print('Removing those points.')

    return entropy(approximate_target_pdf[true_target_pdf > 0],
                   true_target_pdf[true_target_pdf > 0])
Example #31
    def clustering(self):
        """
        Create clusters by movie scores
        """
        # kernel density estimation
        values = self.df['score'].values.reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=3).fit(values)

        # find cluster min-max points
        s = np.linspace(650, 18000)
        e = kde.score_samples(s.reshape(-1, 1))
        mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]

        # concatenate and sort min-max points
        buckets = np.sort(np.concatenate((s[mi], s[ma]), axis=0))

        # assign clusters
        self.df.loc[:, 'cluster'] = buckets.searchsorted(self.df.score)
Example #32
def infer_from_contig2(df, contigs, contig_id, K=100000, K0=3000):
    # generate global KDE estimation
    C = df[(df['X1']==contig_id) & (df['X2']==contig_id)]
    inter = np.abs(C['P1'].values - C['P2'].values)
    kde = KernelDensity(kernel='gaussian', bandwidth=200).fit(inter.reshape(-1, 1))
    f = lambda x: kde.score_samples(x.reshape(-1, 1))

    # distant
    x1 = np.logspace(np.log10(K0), np.log10(K), 500)
    p = lambda x, a, b: a + b * np.log(x)
    param1, cov = curve_fit(p, x1, f(x1))

    # proximal
    degree = 30
    x0 = np.logspace(0, np.log10(K0), 500)
    param0 = np.polyfit(x0, f(x0), degree)

    P = (lambda x: np.where( \
            x < K0, \
            np.poly1d(param0)(x), \
            np.where(x < K, param1[0] + param1[1] * np.log(x), param1[0] + param1[1] * np.log(K)) \
            ))

    # P = (lambda x: np.where( \
    #         x < K0, \
    #         param1[0] + param1[1] * np.log(K0), \
    #         np.where(x < K, param1[0] + param1[1] * np.log(x), param1[0] + param1[1] * np.log(K)) \
    #         ))

    return P, f
Example #33
def KDE_tri(pred_point_cloud, bbox, text):
	''' use KDE to filter outliers in predict point cloud '''
	(h0, h1, w0, w1, d0, d1) = bbox
	X,Y,Z = np.mgrid[h0:h1, w0:w1, d0:d1]
	positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()])

	# 1. KDE
	start = time.time()
	kde = KernelDensity(kernel='epanechnikov', bandwidth=KDE_bandwidth).fit(pred_point_cloud.T)
	score = kde.score_samples(positions.T)
	score = score.reshape(h1-h0, w1-w0, d1-d0)
	filtered_pred_point_cloud = np.where(score > KDE_log_prob_th)
	points_list = [filtered_pred_point_cloud[0] + h0, 
				   filtered_pred_point_cloud[1] + w0, 
				   filtered_pred_point_cloud[2] + d0]
	print('KDE filter done', time.time() - start)
	print('filtered_pred_point_cloud (', filtered_pred_point_cloud[0].shape[0], '* 3 )')
	text.write('filtered_pred_point_cloud: ' + str(filtered_pred_point_cloud[0].shape[0]) + ' * 3 \n')
	text.flush()

	# 2. Delaunay triangulation
	start = time.time()
	points = np.asarray(points_list).T
	tri = Delaunay(points)
	print('Delaunay triangulation done', time.time() - start)
	return points, tri
Example #34
def kdewrap(indata, kernel):
    grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.1, 1.0, 30)},
                    cv=10) # 10-fold cross-validation
    grid.fit(indata[:, None])
    kde = KernelDensity(kernel=kernel, bandwidth=grid.best_params_["bandwidth"]).fit(indata[:, np.newaxis])
    return kde.score_samples(indata[:, np.newaxis])
Example #35
class RegularizedKernelDensityEstimator(BaseEstimator):
    def __init__(self, bandwidth=1.0, regularization=1.0e-5):
        self.bandwidth = bandwidth
        self.regularization = regularization

    def setup(self):
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)

        height, width = self.shape
        self.uniform_density = -np.log(width * height)

        self.kde_constant = np.log(1 - self.regularization)
        self.uniform_constant = np.log(self.regularization)

    def fit(self, X):
        self.shape = X[0, 2:4]
        self.setup()
        self.kde.fit(X[:, 0:2])
        return self

    def score_samples(self, X):
        kde_logliks = self.kde.score_samples(X[:, :2])

        logliks = np.logaddexp(self.kde_constant + kde_logliks,
                               self.uniform_constant + self.uniform_density)
        return logliks

    def score(self, X):
        return np.sum(self.score_samples(X))
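A usage sketch under the assumption (implied by fit) that each row carries a point in columns 0-1 and the frame height/width in columns 2-3; the frame size here is invented:

import numpy as np

X = np.column_stack([np.random.rand(100) * 640,   # x
                     np.random.rand(100) * 480,   # y
                     np.full(100, 480.0),         # height
                     np.full(100, 640.0)])        # width
model = RegularizedKernelDensityEstimator(bandwidth=10.0).fit(X)
print(model.score(X))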
Example #36
def simplify3(nk):
	result=[]
	nk=np.array(nk)
	xk = nk/float(np.sum(nk))
	#print nk
	
	#X_plot = np.linspace(0, len(nk), 1000)[:, np.newaxis]
	sdiv=1000
	X_plot = np.linspace(0, len(xk), sdiv)[:, np.newaxis]
	custm = stats.rv_discrete(name='custm',a=0,b=7, values=(range(len(xk)), xk))
	yk= custm.rvs(size=100000)
	#yk.flatten()
	#fig, ax = plt.subplots(1, 1)
	#ax.hist(yk, normed=True, histtype='stepfilled', alpha=0.2)
	# gaussian KDE
	X=yk.reshape(-1, 1)
	kde = KernelDensity(kernel='gaussian', bandwidth=0.6).fit(X)
	log_dens = kde.score_samples(X_plot)
	mi, ma = argrelextrema(log_dens, np.less)[0], argrelextrema(log_dens, np.greater)[0]
	mi=np.rint(mi*float(len(xk))/float(sdiv))
	ma=np.rint(ma*float(len(xk))/float(sdiv))
	start=0	
	#print mi
	for i in mi:
		i=int(i)
		if start!=i:
			val=np.average(nk[start:i])
			for j in range(start,i):
				result.append(val)
		start=i	
	val=np.average(nk[start:])
	for j in range(start,len(nk)):
			result.append(val)
	return np.array(result)
Example #37
def projected_density_gauss(pos, centre, fov, ncells):
    """
    Input:
        pos: particle positions
        centre: centre of sub-&halo
        fov: field-of-view
        ncells: number of grid cells
    """
    pos = pos - centre
    
    _indx = np.logical_and(np.abs(pos[:, 0]) < 0.5*fov,
                           np.abs(pos[:, 1]) < 0.5*fov)
    pos = pos[_indx, :]
    n = 1024*1024
    h = (4*np.std(pos[:, :2])**5/(3*n))**(1/5)
    #TODO: plot this faulty situation
    kde_skl = KernelDensity(bandwidth=h,
                            kernel='gaussian',
                            algorithm='ball_tree')
    
    xx, yy = np.mgrid[min(pos[:, 0]):max(pos[:, 0]):complex(ncells), 
                      min(pos[:, 1]):max(pos[:, 1]):complex(ncells)]

    xy_sample = np.vstack([xx.ravel(), yy.ravel()]).T

    kde_skl.fit(pos[:, :2])
    sigma = np.exp(kde_skl.score_samples(xy_sample))
    sigma = sigma.reshape(xx.shape)
    return sigma, h
Example #38
def pdf(data: list):
    # hist, bin = np.histogram(data, bins=50)
    # return hist
    kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit([[x] for x in data])
    b = [[x] for x in np.linspace(min(data), max(data), 100)]
    a = np.exp(kde.score_samples(b))

    return a
Example #39
def find_max_density_point(point_list):
    point_list, _ = remove_nan(point_list)
    if point_list.shape[0] == 0:
        return [float('nan'),float('nan'),float('nan')]
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(point_list)
    prob_list = kde.score_samples(point_list)
    max_point = point_list[np.argmax(prob_list)]
    # print "max", max_point
    return max_point
Example #40
def createfeatmat(N):
    grid = getgridcoords(N).T
    featmat = np.zeros((len(vals), N ** 2))
    for i in range(len(vals)):
        m = np.array([vals[i][0], vals[i][1]]).T
        k = KernelDensity(bandwidth=0.5 / (N - 1), kernel="gaussian")
        k.fit(m)
        featmat[i, :] = k.score_samples(grid)
    return featmat
Example #41
def histLine(axes, data, minmax, color):
	(xmin, xmax) = minmax
	data = data.reshape(-1, 1)
	kde = KernelDensity(bandwidth=(xmax-xmin)/100.0).fit(data)
	x = np.linspace(xmin, xmax, 100).reshape(-1, 1)
	foo = kde.score_samples(x)
	density = np.exp(foo)

	axes.plot(x, density, color=color)
Example #42
def estimate_distribution(samples, h=0.1, n_points=100):
	kde = KernelDensity(bandwidth=h)
	min_xs = min(samples)
	max_xs = max(samples)
	samples = samples[:, np.newaxis]
	kde.fit(samples)
	xs = np.linspace(min_xs, max_xs, n_points)
	ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
	print(xs.shape, ys.shape, sum(ys))
	return xs, ys
Example #43
class OneClassKDE(BaseClassifier):
    _fit_params = ["bandwidth"]
    def __init__(self, *args, **kwargs):
        self.bandwidth = kwargs["bandwidth"]
    
    def fit(self, data, **kwargs):
        #self.train_data = data
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        self.kde.fit(data)
        self.training_score = self.kde.score_samples(data)
        self.direct_thresh = numpy.percentile(self.training_score, 10)
    
    def predict(self, data):
        score = self.kde.score_samples(data)
        self.score = score
        return (score < self.direct_thresh).astype(numpy.int32) * -2 + 1  # outlier -> -1, inlier -> +1
    
    def decision_function(self, data):
        return self.score
Example #44
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn"""
    kde_skl = KernelDensity(bandwidth=bandwidth, **kwargs)
    kde_skl.fit(x[:, np.newaxis])
    # score_samples() returns the log of the probability density at the samples
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])
    
    N = np.trapz(np.exp(log_pdf), x_grid)

    return np.exp(log_pdf)/N
Example #45
    def fit(self, X, y):
        a = np.zeros((24, 7))
        hours = np.copy(X[:, 1])
        weekdays = np.copy(X[:, 2])
        hours = 23 * normalize(hours)
        weekdays = 6 * normalize(weekdays)

        if self.strategy == 'mean':
            counts = a.copy()
            for i, row in enumerate(zip(hours, weekdays)):
                hour = int(row[0])
                day = int(row[1])
                counts[hour, day] += 1
                a[hour, day] += y[i]

            counts[counts == 0] = 1
            self._model = a / counts

        elif self.strategy in ('median', 'kernel'):

            # this is a 3d array 
            groups = [[[] for i in range(7)] for j in range(24)]

            for i, row in enumerate(zip(hours, weekdays)):
                hour = int(row[0])
                day = int(row[1])
                groups[hour][day].append(y[i])

            if self.strategy == 'median':
                for i, j in np.ndindex((24, 7)):
                    if groups[i][j]:
                        a[i,j] = np.median(groups[i][j])
                    else:
                        a[i,j] = np.nan
            elif self.strategy == 'kernel':
                # kernel method computes a kernel density for each of the
                # bins and determines the most probable value ('mode' of sorts)
                grid = np.linspace(np.min(y), np.max(y), 1000)[:, np.newaxis]
                for i, j in np.ndindex((24, 7)):
                    if groups[i][j]:
                        npgroups = np.array(groups[i][j])[np.newaxis]
                        kernel = KernelDensity(kernel='gaussian', \
                                                bandwidth=0.2).fit(npgroups.T)
                        density = kernel.score_samples(grid)
                        dmax = np.max(density)
                        imax = np.where(density==dmax)
                        a[i,j] = grid[imax, 0]
                    else:
                        a[i,j] = np.nan

            self._model = a

        # smooth the model here if there are nans
        return self
Example #46
def densityEst(a,x,p,knn=1,Mode='G'):
	""" This is a density estimation currently supporting
	 one-dimensional Data.
	 There are two modes of operation:
	 knn==0 (Default) use fixed bandwidth.
	 knn==1 use k nearest neigbors.
	 Tow types of kernel are supported:
	 Mode=='T' (Default) for triangular.
	 Mode=='G' for Gaussian.
	 a is a vector of samples.
	 p is the parameter of model (bandwidth when knn=0 of number of neighbors
	 otherwise.
	 x is  points of estimation
	"""
	N=len(x)
	x.resize(N,1)
	l=len(a)
	a=num.array(a)
	a.resize(l,1)
	if knn==0:
		try:
			from sklearn.neighbors import KernelDensity
		except ImportError:
			print('Error: Please install the sklearn package...')
			return
		if Mode=='T':
			S='linear'
		elif Mode=='G':
			S='gaussian'
		else:
			print('Currently only G (gaussian) and T (triangular) Modes are supported')
			return
		kde = KernelDensity(kernel=S, bandwidth=p).fit(a)
		return (x,num.exp(kde.score_samples(x)))
	elif knn==1:
		try:
			from sklearn.neighbors import NearestNeighbors
		except ImportError:
			print('Error: Please install the sklearn package...')
			return
		neigh = NearestNeighbors(n_neighbors=p)
		neigh.fit(a)
		dist,index=neigh.kneighbors(x)
		H=dist[:,-1]
		est=[0.0]*N
		for i,point_v in enumerate(x):
			point=point_v[0]
			h=H[i]
			est[i]=sum(kernel((a-point)/h,Mode))/(l*h)
		return (x,est)
	else:
		print('knn must be 0 or 1')
		return
Example #47
def kernel_pmi_func(df, x, y, i, b=1.0):
    x = np.array(df[x])
    y = np.array(df[y])
    x_y = np.stack((x, y), axis=-1)
    
    kde_x = KernelDensity(kernel='gaussian', bandwidth=b).fit(x[:, np.newaxis])
    kde_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(y[:, np.newaxis])
    kde_x_y = KernelDensity(kernel='gaussian', bandwidth=b).fit(x_y)
    
    p_x = np.exp(kde_x.score_samples(x[:, np.newaxis]))
    p_y = np.exp(kde_y.score_samples(y[:, np.newaxis]))
    p_x_y = np.exp(kde_x_y.score_samples(x_y))
    
    # df['PMI_'+str(i)] = np.log( p_x_y / (p_x * p_y) )

    # print "len p_x", len(p_x), "len p_y", len(p_y), "len p x y", len(p_x_y)

    # return df
    vals = np.log(p_x_y / (p_x * p_y))
    # print vals[1]
    return vals
Example #48
def plot_stan_trc(dftrc):
    """
       Create simple plots of parameter distributions and traces from 
       output of pystan sampling. Emulates pymc traceplots.
    """

    fig, ax2d = plt.subplots(nrows=dftrc.shape[1], ncols=2, figsize=(14, 1.8*dftrc.shape[1]),
                                facecolor='0.99', edgecolor='k')
    fig.suptitle('Distributions and traceplots for {} samples'.format(
                                dftrc.shape[0]),fontsize=14)
    fig.subplots_adjust(wspace=0.2, hspace=0.5)

    k = 0
    
    # create density and traceplot, per parameter coeff
    for i, (ax1d, col) in enumerate(zip(ax2d, dftrc.columns)):

        samples = dftrc[col].values
        scale = (10**np.round(np.log10(samples.max() - samples.min()))) / 20
        kde = KernelDensity(bandwidth=scale).fit(samples.reshape(-1, 1))
        x = np.linspace(samples.min(), samples.max(), 100).reshape(-1, 1)
        y = np.exp(kde.score_samples(x))
        clr = sns.color_palette()[0]

        # density plot
        ax1d[0].plot(x, y, color=clr, linewidth=1.4)
        ax1d[0].vlines(np.percentile(samples, [2.5, 97.5]), ymin=0, ymax=y.max()*1.1,
                       alpha=1, linestyles='dotted', colors=clr, linewidth=1.2)
        mn = np.mean(samples)
        ax1d[0].vlines(mn, ymin=0, ymax=y.max()*1.1,
                       alpha=1, colors='r', linewidth=1.2)
        ax1d[0].annotate('{:.2f}'.format(mn), xy=(mn,0), xycoords='data'
                    ,xytext=(5,10), textcoords='offset points', rotation=90
                    ,va='bottom', fontsize='large', color='#AA0022')    
        ax1d[0].set_title('{}'.format(col), fontdict={'fontsize':10})


        # traceplot
        ax1d[1].plot(np.arange(len(samples)),samples, alpha=0.2, color=clr, linestyle='solid'
                              ,marker=',', markerfacecolor=clr, markersize=10)
        ax1d[1].hlines(np.percentile(samples,[2.5, 97.5]), xmin=0, xmax=len(samples),
                       alpha=1, linestyles='dotted', colors=clr)
        ax1d[1].hlines(np.mean(samples), xmin=0, xmax=len(samples), alpha=1, colors='r')

        k += 1
                
        ax1d[0].set_title('{}'.format(col), fontdict={'fontsize':14})#,'fontweight':'bold'})
        #ax1d[0].legend(loc='best', shadow=True)
        
        _ = [ax1d[j].axes.grid(True, linestyle='-', color='lightgrey') for j in range(2)]
            
    plt.subplots_adjust(top=0.94)
    plt.show()
Example #49
def chart_by_time():
  weekday_amrush = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5,6] and traj[0].hour in [7,8,9]]
  weekday_pmrush = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5,6] and traj[0].hour in [17,18,19]]
  weekday_midday = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5,6] and traj[0].hour in [10,11,12,13,14,15,16]]
  weekday_night = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5,6] and traj[0].hour in [20,21,22,23,0,1,2,3,4,5,6]]
  weekend = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() in [5,6]]

  weekday_amrush_avg = sum(weekday_amrush) / float(len(weekday_amrush))
  weekday_pmrush_avg = sum(weekday_pmrush) / float(len(weekday_pmrush))
  weekday_midday_avg = sum(weekday_midday) / float(len(weekday_midday))
  weekday_night_avg = sum(weekday_night) / float(len(weekday_night))
  weekend_avg = sum(weekend) / float(len(weekend))


  print("weekday_amrush_avg: ", weekday_amrush_avg,
        "weekday_pmrush_avg: ", weekday_pmrush_avg,
        "weekday_midday_avg: ", weekday_midday_avg,
        "weekday_night_avg: ", weekday_night_avg,
        "weekend_avg: ", weekend_avg)

  x = np.linspace(min(weekday_amrush+weekday_pmrush+weekday_midday+weekday_night+weekend), max(weekday_amrush+weekday_pmrush+weekday_midday+weekday_night+weekend), 100).reshape(-1, 1)
  kde_weekday_amrush = KernelDensity(bandwidth=70).fit(np.array(weekday_amrush).reshape(-1, 1))
  density_weekday_amrush = np.exp(kde_weekday_amrush.score_samples(x))
  kde_weekday_pmrush = KernelDensity(bandwidth=70).fit(np.array(weekday_pmrush).reshape(-1, 1))
  density_weekday_pmrush = np.exp(kde_weekday_pmrush.score_samples(x))
  kde_weekday_midday = KernelDensity(bandwidth=70).fit(np.array(weekday_midday).reshape(-1, 1))
  density_weekday_midday = np.exp(kde_weekday_midday.score_samples(x))
  kde_weekday_night = KernelDensity(bandwidth=70).fit(np.array(weekday_night).reshape(-1, 1))
  density_weekday_night = np.exp(kde_weekday_night.score_samples(x))
  kde_weekend = KernelDensity(bandwidth=70).fit(np.array(weekend).reshape(-1, 1))
  density_weekend = np.exp(kde_weekend.score_samples(x))

  plt.plot(x, density_weekday_amrush, 'r')
  plt.plot(x, density_weekday_pmrush, 'y')
  plt.plot(x, density_weekday_midday, 'g')
  plt.plot(x, density_weekday_night, 'b')
  plt.plot(x, density_weekend, 'm')
  plt.xlabel("Time start to endpoint")
  plt.ylabel("Density")
  plt.show()
Example #50
def pda_single(synth_data, data, bandwidth=.1):
    #synth_data = np.log(np.abs(synth_data))[:, np.newaxis]
    #data_log = np.log(np.abs(data))[:, np.newaxis]
    synth_data = synth_data[:, np.newaxis]
    data = data[:, np.newaxis]
    if bandwidth == 'silverman':
        lower, upper = scoreatpercentile(synth_data, [25, 75])
        iqr = upper - lower
        sd = np.std(synth_data)
        bandwidth = .9 * min(sd, iqr/1.34) * len(data)**(-1./5)

    kde = KernelDensity(kernel='epanechnikov', bandwidth=bandwidth).fit(synth_data)
    return kde.score_samples(data)
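A hypothetical call of pda_single using the Silverman branch, assuming scoreatpercentile is imported from scipy.stats as in the source module:

import numpy as np

synth = np.random.randn(10000)
obs = np.random.randn(500)
log_dens = pda_single(synth, obs, bandwidth='silverman')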
Example #51
def chart_by_day():
  #
  # On average, trips on the weekend take less time than trips on weekdays
  # 1337 sec versus 1446 sec
  # 
  weekend_times = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() in [5,6]]
  weekday_times = [sum(traj[1:]) for traj in trajs_with_time if traj[0].weekday() not in [5,6]]
  weekend = sum(weekend_times) / float(len(weekend_times))
  weekday = sum(weekday_times) / float(len(weekday_times))
  print("weekend: ", weekend, "weekday: ", weekday)
  x = np.linspace(min(weekend_times + weekday_times), max(weekend_times + weekday_times), 100).reshape(-1, 1)

  kde_weekend = KernelDensity(bandwidth=100).fit(np.array(weekend_times).reshape(-1, 1))
  density_weekend = np.exp(kde_weekend.score_samples(x))

  kde_weekday = KernelDensity(bandwidth=100).fit(np.array(weekday_times).reshape(-1, 1))
  density_weekday = np.exp(kde_weekday.score_samples(x))

  plt.plot(x, density_weekend, 'r')
  plt.plot(x, density_weekday, 'b')
  plt.xlabel("Time start to Grand Ave: red: weekend, blue, weekday")
  plt.ylabel("Density")
  plt.show()
Example #52
def find_centroid(data, bandwidth=0.003, iter_num=6, halfwidth=0.02):
	kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data)
	grid = 10
	position = np.array([0,0])
	#halfwidth = 0.02
	for i in range(iter_num):
		low = position-halfwidth
		high = position+halfwidth
		X, Y = np.mgrid[low[0]:high[0]:20j, low[1]:high[1]:20j]
		positions = np.vstack([X.ravel(), Y.ravel()]).T
		img = kde.score_samples(positions)
		position = positions[np.argmax(img)]
		halfwidth = halfwidth*2./(grid-1.)
	return position
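A minimal sketch of find_centroid on tightly clustered 2-D points (synthetic data, not from the source); since the search window starts at (0, 0), the data here are centred there on purpose:

import numpy as np

data = 0.01 * np.random.randn(500, 2)
print(find_centroid(data))   # approximately [0. 0.]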
Example #53
def test_density_plot():
    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    
    N=20
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]

    print(np.shape(X))
    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    print(np.shape(X_plot))
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax[0,0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    ax[0,0].text(-3.5, 0.31, "Gaussian Kernel Density")
    ax[0,0].plot(X[:, 0], np.zeros(X.shape[0]) - 0.01, '+k')
    
    plt.show()
Example #54
def makeKDE(m):

	m = m[(m>-100) & (m<100)]
	m = m[:, np.newaxis] # Training data
	l = len(m)
	sigma = np.std(m)
	kdebw = (1.*4/3*sigma**5/ l)**(1./5.)
	
	try:
		X_plot = np.linspace(rng[0], rng[1], 1000)[:, np.newaxis]
		kde = KernelDensity(kernel='gaussian', bandwidth=kdebw).fit(m)
		log_dens = kde.score_samples(X_plot)
		log_dens_exp = np.exp(log_dens)
		KDE_mag = float(X_plot[np.argmax(log_dens_exp)])
	except ValueError:
		log_dens_exp = np.ones(len(X_plot[:,0]))*-99.99
		KDE_mag, sigma = -99.99, -99.99
	return X_plot, log_dens_exp, KDE_mag, sigma
Example #55
def plot_2d(i1, i2):
    #px = pca.components_[i1]
    #py = pca.components_[i2]
    #xlabel, ylabel = [], []
    #for i in xrange(len(px)):
    #    xlabel.append('%.2f %s' % (px[i], colnames[i]))
    #    ylabel.append('%.2f %s' % (py[i], colnames[i]))
    #ax = plt.axes()
    #ax.yaxis.set_label_coords(-0.05, 0.2)
    plt.clf()
    xy = np.vstack([output[:, i1], output[:, i2]]).T
    kde = KernelDensity(kernel='tophat', bandwidth=0.01, leaf_size=10).fit(xy)
    z = kde.score_samples(xy)
    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = output[idx, i1], output[idx, i2], z[idx]
    plt.xlabel('Component %i' % i1)
    plt.ylabel('Component %i' % i2)
    plt.scatter(x, y, c=z, s=10, edgecolor='none')
    plt.savefig('ML_data/%s_pca_%s_%s.png' % (name, i1, i2))
Example #56
def simplify_data2(x,y,size):
	avg=[]
	result=[]
	kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(x)
	s = np.linspace(0,size,len(x))
	e = kde.score_samples(s.reshape(-1,1))
	mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]
	start=0	
	for i in mi:
		val=np.average(x[start:i])
		for j in range(start,i):
			result.append(val)
		start=i	
	val=np.average(x[start:])
	for j in range(start,len(x)):
			result.append(val)
	#plt.plot(s, e*0.01+e[mi[0]])
	print(mi)
	print(ma)
	plt.plot(s,x.reshape(1,-1)[0])
	plt.plot(s,result)
	#print x, len(x)
	plt.show()
Example #57
def doKDEBasedPlot(dataSamples, targetName, featureName, doSave):
	#doSave = True
	if(doSkipDraw):
		return
	doSave = doSaveGlobal
	targetJustName = ""
	fields = targetName.split(":")
	targetJustName = fields[len(fields) - 1]
	isPapi = targetJustName.find("PAPI_")
	if(isPapi != -1):
		targetJustName = targetJustName[5:]

	featureJustName = ""
	fields = featureName.split(":")
	featureJustName = fields[len(fields) - 1]
	featureName = featureJustName

        appName = ""
        #appName = "Linpack"
        #appName = "Matrix Multiplication"
        #appName = "Sparse Matrix Vector Multiplication"
        #appName = "Black Scholes"
        appName = "FFmpeg"
        #appName = "PageRank"
        #appName = "PageRank"

	if(os.getenv("APPNAME") != None):
                appName = os.getenv("APPNAME")
        print "\nappName = " , appName , "\n"

        dumpDir = codeDir + "/gold_histograms/"
        if(appName == "Sparse Matrix Vector Multiplication"):
                dumpDir = dumpDir + "SPARSE_MATRIX_MUL"
        if(appName == "Linpack"):
                dumpDir = dumpDir + "LINPACK"
        if(appName == "Matrix Multiplication"):
                dumpDir = dumpDir + "MATRIX_MUL"
        if(appName == "FFmpeg"):
                dumpDir = dumpDir + "FFmpeg"
        if(appName == "PageRank"):
                dumpDir = dumpDir + "PAGE_RANK"

	if(appName == "Black Scholes"):
                dumpDir = dumpDir + "BLACKSCHOLES"

	if(appName == "LULESH"):
                dumpDir = dumpDir + "LULESH"

	if(appName == "CoMD"):
                dumpDir = dumpDir + "CoMD"

	if(appName == "Sparse Matrix Vector Multiplication"):
                dataSamples = [x / 10.0 for x in dataSamples]

        fileName1 = "errHisto_" + getGlobalObject("baseModuleName") + ":" + targetName + "_" + featureName + "_kde.png"
        fileName2 = "errHisto_" + getGlobalObject("baseModuleName") + ":" + targetName + "_" + featureName + "_tight_kde.png"
        fileName3 = "errHisto_" + getGlobalObject("baseModuleName") + ":" + targetName + "_" + featureName + "_kde.eps"
        fileName4 = "errHisto_" + getGlobalObject("baseModuleName") + ":" + targetName + "_" + featureName + "_tight_kde.eps"
        saveFileName1 = os.path.join(dumpDir,fileName1)
        saveFileName2 = os.path.join(dumpDir,fileName2)
        saveFileName3 = os.path.join(dumpDir,fileName3)
        saveFileName4 = os.path.join(dumpDir,fileName4)

        #ttl = "Error histogram: \n" + targetName + " for " + featureName 
        #ttl = appName + ": " + getGlobalObject("baseModuleName") + " - Observation: " + targetJustName
	ttl = getGlobalObject("baseModuleName") + " - Obs:" + targetJustName

	dataSamples = np.array(dataSamples)
	#grid = GridSearchCV(KernelDensity(kernel='gaussian'),{'bandwidth': np.linspace(0.1, 1.0, 5)},cv=10) # 20-fold cross-validation
	#fitScore = grid.fit(dataSamples[:, None])
	#print grid.best_params_, " fit score : " , fitScore
	
	#kde = grid.best_estimator_
	kde = KernelDensity(kernel='gaussian', bandwidth=0.17).fit(dataSamples[:, None])
	maxVal = max(dataSamples)
	minVal = min(dataSamples)
	#a = maxVal - 0.02
	#b = maxVal + 0.01
	#print "log-prob at 0 : ", kde.score(0)
	#print "log-prob at", a, " : ", kde.score(a)
	#print "log-prob at", b, " : ", kde.score(b)
    
	#x_grid = np.linspace(minVal, maxVal, 1000)
	x_grid = np.linspace(-4.0, 4.0, 5000)
	pdf = kde.score_samples(x_grid[:, None])
	#pdf = np.exp(kde.score_samples(x_grid[:, None]))
	
	#pdf = np.exp(kde.score_samples(dataSamples[:, None]))

	fig, ax = plt.subplots()
	fig1 = plt.figure(frameon=False)
	#ax.plot(x_grid, np.exp(pdf), linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
	#ax.plot(x_grid, np.exp(pdf), linewidth=3, color='b')
	#ax.fill(x_grid, np.exp(pdf), fc='g',alpha=0.75)
	ax.fill(x_grid, pdf, fc='g',alpha=0.75)
	#ax.plot(dataSamples, pdf, linewidth=3, alpha=0.5, label='bw=%.2f' % kde.bandwidth)
	#ax.hist(dataSamples, 30, fc='gray', histtype='stepfilled', alpha=0.3, normed=True)
	#ax.legend(loc='upper left')
	plt.title(ttl, fontsize=20)
	plt.xlabel('% Error in prediction', fontsize=20)
	plt.ylabel('Density', fontsize=20)
	#plt.xticks(np.arange(min(x), max(x)+1, 20.0))
	plt.yticks(np.linspace(y[0], y[len(y)-2], 5))  # NOTE: 'y' is not defined in this excerpt
	plt.xticks(fontsize=14)
	plt.yticks(fontsize=14)

	#doSave = False
	if(doSave == False):
		plt.show()
		#plt.savefig(saveFileName)
		#plt.savefig(saveFileName2, bbox_inches='tight')
		#plt.savefig(saveFileName3, bbox_inches='tight')
		#plt.close()
	else:
		print("SaveFileName = " + saveFileName1)
		#plt.savefig(saveFileName, bbox_inches='tight')
		plt.savefig(saveFileName1, format='png', dpi=800)
		plt.savefig(saveFileName2, format='png', dpi=800, bbox_inches='tight')
		plt.savefig(saveFileName3, format='eps', dpi=800)
		plt.savefig(saveFileName4, format='eps', dpi=800, bbox_inches='tight')
		plt.close()
Example #58
#PCA
pca = PCA(n_components=20)
pca.fit(msa_vectors[1000:])
a_samps_pca = pca.transform(msa_vectors[1000:])
b_samps_pca = pca.transform(msa_vectors[:1000])
print(a_samps_pca.shape)

#KDE
# for bw in [.01, .1, 1., 10.]:
for bw in [ 1.]:

	kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(a_samps_pca)
	# density_train = kde.score_samples(msa_vectors)
	print(bw, kde.score(b_samps_pca))

densities = kde.score_samples(b_samps_pca)
# densities = np.ones(1000)

#Scale densities to betw 0 and 1
min_density = np.min(densities)
densities = densities - min_density + 1.

weights = np.reciprocal(densities)

max_weights = np.max(weights)
weights = weights / max_weights

print(np.max(weights))
print(np.mean(weights))
print(np.min(weights))