def simplify3(nk):
    """Flatten a histogram into plateaus separated by density minima.

    The counts ``nk`` are normalised into a discrete distribution, 100000
    values are drawn from it, and a Gaussian KDE (bandwidth 0.6) is fitted to
    the draws.  The local minima of the KDE log-density, mapped back to bin
    indices, split ``nk`` into consecutive segments; every bin is replaced by
    the mean of its segment.

    Parameters
    ----------
    nk : sequence of numbers
        Bin counts.

    Returns
    -------
    numpy.ndarray
        One value per input bin, constant within each segment.  The result is
        stochastic because the KDE is fitted to random draws.
    """
    nk = np.array(nk)
    xk = nk / float(np.sum(nk))
    n_bins = len(xk)

    # Resolution of the grid on which the smoothed density is evaluated.
    sdiv = 1000
    X_plot = np.linspace(0, n_bins, sdiv)[:, np.newaxis]

    custm = stats.rv_discrete(name='custm', a=0, b=7,
                              values=(range(n_bins), xk))
    yk = custm.rvs(size=100000)

    # gaussian KDE
    kde = KernelDensity(kernel='gaussian', bandwidth=0.6).fit(yk.reshape(-1, 1))
    log_dens = kde.score_samples(X_plot)

    # Grid positions of the local minima, converted to (rounded) bin indices.
    mi = argrelextrema(log_dens, np.less)[0]
    mi = np.rint(mi * float(n_bins) / float(sdiv))

    result = []
    start = 0
    for i in mi:
        i = int(i)
        if start != i:
            result.extend([np.average(nk[start:i])] * (i - start))
            start = i
    # Trailing segment; skipped when the last minimum maps past the final bin
    # so that no mean of an empty slice is taken.
    if start < len(nk):
        result.extend([np.average(nk[start:])] * (len(nk) - start))
    return np.array(result)
def xy_kde(xy, bandwidth, N_grid=100, levels=(0.8, 0.6, 0.4, 0.2)):
    """Evaluate a 2-D Gaussian KDE on a grid and derive contour thresholds.

    Parameters
    ----------
    xy : numpy.ndarray
        Array indexed as ``xy[:, 0]`` (x) and ``xy[:, 1]`` (y).
    bandwidth : float
        Bandwidth passed to ``KernelDensity``.
    N_grid : int
        Number of grid cells along each axis.
    levels : sequence of float
        Cumulative-mass fractions for which a density threshold is computed.

    Returns
    -------
    H : numpy.ndarray
        ``(N_grid, N_grid)`` density evaluated at the cell centres.
    V : numpy.ndarray
        Density thresholds for ``levels``, sorted ascending.
    x_grid, y_grid : numpy.ndarray
        Meshgrid of the cell centres.
    """
    x_edges = np.linspace(np.min(xy[:, 0]), np.max(xy[:, 0]), N_grid + 1)
    y_edges = np.linspace(np.min(xy[:, 1]), np.max(xy[:, 1]), N_grid + 1)

    # Cell centres: left edge plus half the cell width.
    x_centres = x_edges[:-1] + (x_edges[1:] - x_edges[:-1]) / 2
    y_centres = y_edges[:-1] + (y_edges[1:] - y_edges[:-1]) / 2

    x_grid, y_grid = np.meshgrid(x_centres, y_centres)
    xy_grid = np.array([np.ravel(x_grid), np.ravel(y_grid)]).T

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(xy)
    H = np.exp(kde.score_samples(xy_grid).reshape(N_grid, N_grid))

    # this bit is taken from the corner_plot.py method.
    # Sort densities from highest to lowest and accumulate their mass.
    Hflat = np.sort(H.flatten())[::-1]
    sm = np.cumsum(Hflat)
    sm /= sm[-1]

    V = np.empty(len(levels))
    for i, v0 in enumerate(levels):
        enclosed = Hflat[sm <= v0]
        # No cell fits under this mass fraction: fall back to the peak value.
        V[i] = enclosed[-1] if enclosed.size else Hflat[0]

    V = np.sort(V)
    return H, V, x_grid, y_grid
def kdewrap(indata, kernel):
    """Score 1-D samples under a KDE whose bandwidth is cross-validated.

    The bandwidth is chosen by a 10-fold grid search over 30 values in
    [0.1, 1.0]; a ``KernelDensity`` with the requested ``kernel`` and that
    bandwidth is then fitted to ``indata`` and evaluated on the same points.

    Returns the array produced by ``score_samples`` (log-density per sample).
    """
    column = indata[:, np.newaxis]

    # NOTE(review): the search estimator uses KernelDensity's default kernel,
    # not ``kernel`` -- confirm this is intended.
    search = GridSearchCV(
        KernelDensity(),
        {'bandwidth': np.linspace(0.1, 1.0, 30)},
        cv=10,
    )
    search.fit(column)
    best_bandwidth = search.best_params_["bandwidth"]

    fitted = KernelDensity(kernel=kernel, bandwidth=best_bandwidth)
    fitted.fit(column)
    return fitted.score_samples(column)
def _importance_preprocess_uni(states, rewards, gradients, p_tar, p_gen):
    """Accumulate per-step trajectory quantities into an episode-info record.

    A Gaussian KDE (bandwidth 0.25) is fitted to every state of every
    trajectory.  For each trajectory the cumulative sums of ``p_tar``,
    ``p_gen`` and ``gradients`` are taken, the KDE score of each state is
    added to the cumulative ``p_gen``, and the rewards are summed from each
    step to the end of the trajectory.  Everything is appended to the record
    returned by ``_create_episode_info()``.

    NOTE(review): ``p_tar`` / ``p_gen`` are presumably per-step
    log-probabilities, since they are summed with KDE log-densities -- confirm.
    """
    res = _create_episode_info()

    # TODO Pass in as args?
    state_kde = KernelDensity(kernel='gaussian', bandwidth=0.25)
    state_kde.fit([s for traj in states for s in traj])

    for ss, rs, gs, ps, qs in izip(states, rewards, gradients, p_tar, p_gen):
        state_scores = state_kde.score_samples(ss)

        cum_p = np.cumsum(ps)  # + np.mean(state_probs)
        cum_q = np.cumsum(qs) + state_scores
        cum_grads = np.cumsum(gs, axis=0)

        # Sum of rewards from each step to the end of the trajectory.
        reward_to_go = np.cumsum(rs[::-1])[::-1]
        weighted_grads = (reward_to_go * cum_grads.T).T

        res.r_grads.extend(weighted_grads)
        res.traj_p_tar.extend(cum_p)
        res.traj_p_gen.extend(cum_q)
        res.traj_grads.extend(cum_grads)
        res.traj_r.extend(reward_to_go)

        # Used for estimating fisher
        res.act_grads.extend(gs)
        res.state_act_p_tar.extend(cum_p)
        res.state_act_p_gen.extend(cum_q)

    return res
class OneClassKDE(BaseClassifier):
    """One-class outlier detector thresholding a Gaussian KDE log-density.

    ``fit`` randomly splits the data in two: one half fits the KDE, the other
    half is scored to pick the threshold at the ``100 - perc_keep`` percentile.
    ``predict`` labels samples scoring below the threshold as -1, others as +1.
    """

    _fit_params = ["bandwidth"]
    _predict_params = []

    def __init__(self, *args, **kwargs):
        """Read the required ``bandwidth`` and ``perc_keep`` keyword arguments."""
        self.bandwidth = kwargs["bandwidth"]
        self.perc_keep = kwargs["perc_keep"]

    def fit(self, data, **kwargs):
        """Fit the KDE on a random subset and calibrate the threshold on the rest."""
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)

        # Random boolean mask: True rows train the KDE, False rows calibrate.
        # (``numpy.bool`` was removed from NumPy; the builtin is equivalent.)
        idx = numpy.random.randint(2, size=len(data)).astype(bool)
        print(idx)

        self.kde.fit(data[idx, :])
        self.training_score = self.kde.score_samples(data[~idx, :])
        self.direct_thresh = numpy.percentile(self.training_score,
                                              100 - self.perc_keep)
        print("training %s %s %s %s" % (self.training_score.min(),
                                        self.training_score.mean(),
                                        self.training_score.max(),
                                        self.direct_thresh))
        print(self.direct_thresh)

    def predict(self, data):
        """Return -1 for samples below the density threshold, +1 otherwise."""
        score = self.kde.score_samples(data)
        self.score = score
        res = (score < self.direct_thresh)
        print("test %s %s %s" % (self.score.min(), self.score.mean(),
                                 self.score.max()))
        print("%s of %s outliers" % (res.sum(), len(self.score)))
        # ``uint8 * -2`` overflows under NumPy 2 promotion rules; build the
        # labels directly and keep the int16 dtype the old expression produced.
        return numpy.where(res, -1, 1).astype(numpy.int16)

    def decision_function(self, data=None):
        """Return KDE log-density scores.

        With ``data=None`` the scores cached by the last ``predict`` call are
        returned; otherwise ``data`` is scored directly.
        """
        if data is None:
            return self.score
        return self.kde.score_samples(data)
def estimate_distribution(samples, h=0.1, n_points=100):
    """Estimate the density of 1-D ``samples`` on a fixed grid over [-1, 1].

    Parameters
    ----------
    samples : numpy.ndarray
        1-D array of observations.
    h : float
        KDE bandwidth.
    n_points : int
        Number of evenly spaced evaluation points in [-1, 1].

    Returns
    -------
    xs : numpy.ndarray
        The evaluation points.
    ys : list
        Estimated density at each point.
    """
    kde = KernelDensity(bandwidth=h)
    kde.fit(samples[:, np.newaxis])

    xs = np.linspace(-1.0, 1.0, n_points)
    # Score the whole grid in one call with the 2-D (n_points, 1) layout that
    # scikit-learn expects, rather than one 1-D ``[x]`` at a time.
    ys = list(np.exp(kde.score_samples(xs[:, np.newaxis])))
    return xs, ys
def kernel_estimation(test, train_n, train_p):
    """Score test samples by the log-ratio of two kernel density estimates.

    Two Gaussian KDEs are fitted, one on ``train_n`` (bandwidth 0.999) and one
    on ``train_p`` (bandwidth 4.772).  For every row of ``test`` the score is
    ``log(p_density / n_density + 1)``; when the ``n`` density is exactly zero
    the fixed floor 1.8404e-17 is used as the denominator instead.

    NOTE(review): ``train_n`` / ``train_p`` are presumably the negative and
    positive training sets -- confirm with the caller.

    Parameters
    ----------
    test, train_n, train_p : array-like
        2-D collections of feature vectors (one sample per row).

    Returns
    -------
    list
        One relevance score per row of ``test``.
    """
    X_n = np.array(train_n)
    X_p = np.array(train_p)
    Y = np.array(test)

    kde_n = KernelDensity(kernel='gaussian', bandwidth=0.999).fit(X_n)
    kde_p = KernelDensity(kernel='gaussian', bandwidth=4.772).fit(X_p)

    if len(Y) == 0:
        return []

    # Score all samples at once; this avoids round-tripping each value through
    # its printed string representation, which discarded precision.
    dens_n = np.exp(kde_n.score_samples(Y))
    dens_p = np.exp(kde_p.score_samples(Y))

    # Denominator substituted when the ``n`` density underflows to zero.
    zero_density_floor = 1.8404e-17
    denominator = np.where(dens_n == 0.0, zero_density_floor, dens_n)

    return list(np.log(dens_p / denominator + 1))
def sklearn_kde_plot(dataframe, choose_choice, topic_name, fold_num):
    """Plot a Gaussian KDE (bandwidth 0.01) of the values in ``dataframe``.

    The density is drawn on 500 points from the minimum value up to 10, with a
    jittered rug of the raw observations below the axis, and shown with
    ``plt.show()``.

    Parameters
    ----------
    dataframe : object exposing ``.values``
        NOTE(review): ``.values`` is indexed as a 1-D array, so this is
        presumably a pandas Series -- confirm.
    choose_choice, topic_name, fold_num :
        Inserted into the plot title.
    """
    values = dataframe.values
    N = values.size
    X = values[:, np.newaxis]

    # Upper plotting bound is hard-coded to 10 (tunable).
    X_plot = np.linspace(min(values), 10, num=500)[:, np.newaxis]

    fig, ax = plt.subplots()

    # Other kernels: 'tophat', 'epanechnikov'.  Bandwidth is tunable.
    kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
            label="kernel = '{0}'".format('gaussian'))

    ax.text(6, 0.38, "N={0} points".format(N))
    ax.legend(loc='upper right')

    # Rug plot: raw observations with random vertical jitter below zero.
    ax.plot(X[:, 0], -0.005 - 0.005 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(0, 10)        # tunable
    ax.set_ylim(-0.02, 1.0)   # tunable
    ax.set_xlabel("Delta Follower")
    ax.set_ylabel("Density")
    # ``format`` also accepts non-string values such as an integer fold number.
    plt.title('Density - {0} ({1}, {2})'.format(choose_choice, topic_name,
                                                fold_num))
    plt.show()
def basic_properties( sequences , axess=None, labl = None, logscale=[False], markr='.', clr='k',offset=0, alfa = 0.8, distir = [False,False,False, False], bandwidths = [3, 0.1,0.01,1], limits = [(1,50),(0,1),(0,1),(1,25)] ):
    # Plot the distribution of each sequence in `sequences` on row `offset`
    # of the axes grid `axess`, one column per sequence, and return `axess`.
    #
    # For sequence i:
    #   * distir[i] == 0 -> normalised histogram drawn as a dotted line;
    #   * otherwise      -> Gaussian KDE (bandwidth bandwidths[i]) of the
    #     values inside limits[i], drawn as a line plus a filled polygon;
    #     distir[i] == 2 plots the cumulative sum of the density instead.
    # When `axess` is None a new 3 x len(sequences) grid is created.
    #
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source; the placement of the second subplots_adjust, of set_xlim and of
    # the `i<3` check should be confirmed against the original file.
    # NOTE(review): `markr` and `smin` are never used.
    if axess is None:
        fig,axess = plt.subplots( 3, len(sequences),False,False, squeeze=False,figsize=(len(sequences)*3,8))#'col'
        plt.subplots_adjust(left=0.12, bottom=0.05, right=0.95, top=0.94, wspace=0.28, hspace=0.1)
        # This second call overrides the margins set by the call just above.
        plt.subplots_adjust(left=0.45, bottom=0.05, right=0.95, top=0.94, wspace=0.28, hspace=1.2)
    for i in range(0,len(sequences)):
        ax = axess[offset][i]
        seq = sequences[i]
        smax =max(seq)
        smin =min(seq)
        if distir[i]==0:
            #print seq
            # One unit-wide bin per integer value when smax > 1, otherwise
            # 100 bins over [0, smax].
            freqs , bin_edges = np.histogram(seq, smax+1 if smax>1 else 100, range = (0,smax+1) if smax>1 else (0,smax))#, normed = True, density=True)
            bin_centers = (bin_edges[:-1] + bin_edges[1:])/2.
            vals = range(0,smax+1) if smax>1 else bin_centers
            # Convert counts to relative frequencies.
            freqs=freqs*1.0/sum(freqs)
            #remove zeros
            y = np.array(freqs)
            nz_indexes = np.nonzero(y)
            y = y[nz_indexes]
            x = np.array(vals)[nz_indexes]
            ax.plot(x, y,':', label=labl, alpha =alfa, color = clr , marker ='.')
        else :
            X = np.array(seq)
            # Keep only the values inside the plotting limits.
            X = [ x for x in X if x>=limits[i][0] and x<=limits[i][1]]
            # X= (np.abs(X))
            # print len(X)
            # Draw (with replacement) at most 10000 of the kept values.
            X = np.random.choice(X, size=min(10000, len(X)))
            X = X[:, np.newaxis]
            kde = KernelDensity(kernel = 'gaussian', bandwidth=bandwidths[i]).fit(X)#,atol=atols[i],kernel = 'tophat'kernel='gaussian'
            # if 'x' in logscale[i] :
            #     X_plot = np.logspace( limits[i][0], limits[i][1], 1000)[:, np.newaxis]
            # else :
            X_plot = np.linspace(limits[i][0], limits[i][1], 1000)[:, np.newaxis]
            log_dens = kde.score_samples(X_plot)
            # ax.fill(X_plot[:, 0], np.exp(log_dens), alpha =0.5, label=labl)
            Y = np.exp(log_dens)
            if distir[i]==2: Y = np.cumsum(Y)
            ax.plot(X_plot[:, 0],Y, '-',label=labl, alpha =alfa, color = clr ,markersize=2, marker ='')
            # Close the curve down to y=0 just outside both limits so that it
            # can be filled as a polygon.
            verts = [(limits[i][0]-1e-6, 0)] + list(zip(X_plot[:, 0],Y)) + [(limits[i][1]+1e-6, 0)]
            poly = Polygon(verts, facecolor=clr, alpha =alfa ) #, edgecolor='0.5')
            ax.add_patch(poly)
            # ax.set_yticks([])
            # ax.set_ylim(bottom=-0.02)
            ax.set_xlim(limits[i][0],limits[i][1])
        # Per-axis log scaling is applied only when one `logscale` entry is
        # given per sequence.
        # NOTE(review): with the default logscale=[False] and one sequence,
        # `'x' in False` raises TypeError -- entries are expected to be strings.
        if len(logscale)==len(sequences):
            if 'x' in logscale[i] : ax.set_xscale('log')
            if 'y' in logscale[i] :
                ax.set_yscale('log')
                if i<3: ax.set_ylim(bottom=0.001)
    # ax.legend()
    # plt.show(block=False)
    return axess
def pdf(data: list):
    """Return a Gaussian-KDE density of ``data`` sampled at 100 points.

    The KDE uses bandwidth 0.1 and is evaluated on 100 evenly spaced points
    between ``min(data)`` and ``max(data)``.
    """
    # hist, bin = np.histogram(data, bins=50)
    # return hist
    lowest, highest = min(data), max(data)
    observations = np.asarray(data).reshape(-1, 1)

    estimator = KernelDensity(kernel='gaussian', bandwidth=0.1)
    estimator.fit(observations)

    grid = np.linspace(lowest, highest, 100).reshape(-1, 1)
    return np.exp(estimator.score_samples(grid))
def find_max_density_point(point_list):
    """Return the input point with the highest Gaussian-KDE score.

    Points are first filtered by ``remove_nan``.  If nothing remains, a list
    of three NaNs is returned; otherwise a KDE (bandwidth 0.01) is fitted to
    the points and the one with the largest ``score_samples`` value is
    returned.
    """
    point_list, _ = remove_nan(point_list)
    if point_list.shape[0] == 0:
        return [float('nan')] * 3

    estimator = KernelDensity(kernel='gaussian', bandwidth=0.01)
    estimator.fit(point_list)
    log_density = estimator.score_samples(point_list)
    return point_list[np.argmax(log_density)]
def histLine(axes, data, minmax, color):
    """Draw a KDE curve of ``data`` on ``axes`` between ``minmax`` bounds.

    The bandwidth is one hundredth of the plotted range and the density is
    evaluated on 100 evenly spaced points.
    """
    xmin, xmax = minmax
    bandwidth = (xmax - xmin) / 100.0

    observations = data.reshape(-1, 1)
    estimator = KernelDensity(bandwidth=bandwidth).fit(observations)

    grid = np.linspace(xmin, xmax, 100).reshape(-1, 1)
    axes.plot(grid, np.exp(estimator.score_samples(grid)), color=color)
def createfeatmat(N):
    """Build a feature matrix of KDE scores on an N x N grid.

    For every entry of the module-level ``vals`` a Gaussian KDE with bandwidth
    ``0.5 / (N - 1)`` is fitted to the points formed from the entry's first
    two components, and its ``score_samples`` values on the grid returned by
    ``getgridcoords(N)`` become one row of the result.

    Returns an array of shape ``(len(vals), N ** 2)``.
    """
    grid = getgridcoords(N).T
    featmat = np.zeros((len(vals), N ** 2))

    for row, entry in enumerate(vals):
        points = np.array([entry[0], entry[1]]).T
        estimator = KernelDensity(bandwidth=0.5 / (N - 1), kernel="gaussian")
        estimator.fit(points)
        featmat[row, :] = estimator.score_samples(grid)

    return featmat
def estimate_distribution(samples, h=0.1, n_points=100):
    """Estimate the density of 1-D ``samples`` across their own range.

    Parameters
    ----------
    samples : numpy.ndarray
        1-D array of observations.
    h : float
        KDE bandwidth.
    n_points : int
        Number of evenly spaced evaluation points between the sample minimum
        and maximum.

    Returns
    -------
    xs, ys : numpy.ndarray
        Evaluation points and the estimated density at each of them.
    """
    kde = KernelDensity(bandwidth=h)
    min_xs = min(samples)
    max_xs = max(samples)

    kde.fit(samples[:, np.newaxis])

    xs = np.linspace(min_xs, max_xs, n_points)
    ys = np.exp(kde.score_samples(xs[:, np.newaxis]))
    # Diagnostic output, written so it runs on both Python 2 and Python 3.
    print("%s %s %s" % (xs.shape, ys.shape, sum(ys)))
    return xs, ys
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scikit-learn.

    Fits a ``KernelDensity`` (extra ``kwargs`` are forwarded to it) on the 1-D
    samples ``x``, evaluates it on ``x_grid`` and returns the density divided
    by its trapezoidal integral over ``x_grid``.
    """
    estimator = KernelDensity(bandwidth=bandwidth, **kwargs)
    estimator.fit(x[:, np.newaxis])

    # score_samples() returns the log-likelihood of the samples
    density = np.exp(estimator.score_samples(x_grid[:, np.newaxis]))

    area = np.trapz(density, x_grid)
    return density / area
def fit(self, X, y):
    """Fit a 24 x 7 (hour x weekday) lookup table of target values.

    Columns 1 and 2 of ``X`` are passed through ``normalize`` and scaled to
    hour indices 0..23 and weekday indices 0..6.  ``self.strategy`` selects how
    the targets ``y`` falling into each cell are summarised:

    * ``'mean'``   - arithmetic mean (cells without data hold 0);
    * ``'median'`` - median (cells without data hold NaN);
    * ``'kernel'`` - location of the maximum of a Gaussian KDE (bandwidth 0.2)
      evaluated on 1000 points spanning the range of ``y`` (cells without
      data hold NaN).

    The table is stored in ``self._model``; any other strategy leaves
    ``self._model`` untouched.  Returns ``self``.
    """
    a = np.zeros((24, 7))
    hours = np.copy(X[:, 1])
    weekdays = np.copy(X[:, 2])
    hours = 23 * normalize(hours)
    weekdays = 6 * normalize(weekdays)

    if self.strategy == 'mean':
        counts = a.copy()
        for i, (hour, day) in enumerate(zip(hours, weekdays)):
            hour, day = int(hour), int(day)
            counts[hour, day] += 1
            a[hour, day] += y[i]
        # Avoid division by zero for cells that received no samples.
        counts[counts == 0] = 1
        self._model = a / counts

    elif self.strategy in ('median', 'kernel'):
        # groups[hour][day] collects the targets that fall into that cell.
        groups = [[[] for _ in range(7)] for _ in range(24)]
        for i, (hour, day) in enumerate(zip(hours, weekdays)):
            groups[int(hour)][int(day)].append(y[i])

        if self.strategy == 'median':
            for i, j in np.ndindex((24, 7)):
                a[i, j] = np.median(groups[i][j]) if groups[i][j] else np.nan

        elif self.strategy == 'kernel':
            # kernel method computes a kernel density for each of the
            # bins and determines the most probable value ('mode' of sorts)
            grid = np.linspace(np.min(y), np.max(y), 1000)[:, np.newaxis]
            for i, j in np.ndindex((24, 7)):
                if groups[i][j]:
                    npgroups = np.array(groups[i][j])[np.newaxis]
                    kernel = KernelDensity(kernel='gaussian',
                                           bandwidth=0.2).fit(npgroups.T)
                    density = kernel.score_samples(grid)
                    # argmax yields one scalar index even when several grid
                    # points tie for the maximum.
                    a[i, j] = grid[np.argmax(density), 0]
                else:
                    a[i, j] = np.nan

        self._model = a
        # smooth the model here if there are nans
    return self
def plot_stan_trc(dftrc):
    """
       Create simple plots of parameter distributions and traces from
       output of pystan sampling. Emulates pymc traceplots.

       ``dftrc`` is indexed like a DataFrame with one column per parameter and
       one row per draw.  Each parameter gets one row of two panels: a KDE of
       its samples (2.5/97.5 percentiles dotted, mean in red and annotated)
       and the raw trace with the same reference lines.
    """
    n_params = dftrc.shape[1]

    # squeeze=False keeps ``ax2d`` two-dimensional even when there is only a
    # single parameter, so the per-row unpacking below always works.
    fig, ax2d = plt.subplots(nrows=n_params, ncols=2, squeeze=False,
                             figsize=(14, 1.8 * n_params),
                             facecolor='0.99', edgecolor='k')
    fig.suptitle('Distributions and traceplots for {} samples'.format(
        dftrc.shape[0]), fontsize=14)
    fig.subplots_adjust(wspace=0.2, hspace=0.5)

    clr = sns.color_palette()[0]

    # create density and traceplot, per parameter coeff
    for ax1d, col in zip(ax2d, dftrc.columns):
        samples = dftrc[col].values

        # Bandwidth: 1/20 of the sample range rounded to a power of ten.
        scale = (10**np.round(np.log10(samples.max() - samples.min()))) / 20
        kde = KernelDensity(bandwidth=scale).fit(samples.reshape(-1, 1))
        x = np.linspace(samples.min(), samples.max(), 100).reshape(-1, 1)
        y = np.exp(kde.score_samples(x))

        interval = np.percentile(samples, [2.5, 97.5])
        mn = np.mean(samples)
        n_draws = len(samples)
        density_ax, trace_ax = ax1d[0], ax1d[1]

        # density plot
        density_ax.plot(x, y, color=clr, linewidth=1.4)
        density_ax.vlines(interval, ymin=0, ymax=y.max()*1.1, alpha=1,
                          linestyles='dotted', colors=clr, linewidth=1.2)
        density_ax.vlines(mn, ymin=0, ymax=y.max()*1.1, alpha=1, colors='r',
                          linewidth=1.2)
        density_ax.annotate('{:.2f}'.format(mn), xy=(mn, 0), xycoords='data',
                            xytext=(5, 10), textcoords='offset points',
                            rotation=90, va='bottom', fontsize='large',
                            color='#AA0022')
        density_ax.set_title('{}'.format(col), fontdict={'fontsize': 14})

        # traceplot
        trace_ax.plot(np.arange(n_draws), samples, alpha=0.2, color=clr,
                      linestyle='solid', marker=',', markerfacecolor=clr,
                      markersize=10)
        trace_ax.hlines(interval, xmin=0, xmax=n_draws, alpha=1,
                        linestyles='dotted', colors=clr)
        trace_ax.hlines(mn, xmin=0, xmax=n_draws, alpha=1, colors='r')

        for panel in ax1d:
            panel.axes.grid(True, linestyle='-', color='lightgrey')

    plt.subplots_adjust(top=0.94)
    plt.show()
def densityEst(a, x, p, knn=1, Mode='G'):
    """
    Density estimation for one-dimensional data.

    There are two modes of operation:
        knn == 0            fixed bandwidth (scikit-learn KernelDensity).
        knn == 1 (default)  bandwidth per evaluation point taken as the
                            distance to its p-th nearest sample.

    Two types of kernel are supported:
        Mode == 'T'            triangular.
        Mode == 'G' (default)  Gaussian.

    a is a vector of samples.
    p is the model parameter (bandwidth when knn == 0, number of neighbors
    otherwise).
    x is the array of evaluation points; it is reshaped IN PLACE to (N, 1).

    Returns a tuple (x, estimate), or None (after printing a message) when
    scikit-learn is missing or an unsupported ``knn`` / ``Mode`` is given.
    """
    N = len(x)
    x.resize(N, 1)
    l = len(a)
    a = num.array(a)
    a.resize(l, 1)

    if knn == 0:
        try:
            try:
                from sklearn.neighbors import KernelDensity
            except ImportError:
                # Very old scikit-learn releases only expose the class here.
                from sklearn.neighbors.kde import KernelDensity
        except ImportError:
            print('Error:Please install sklearn package...')
            return
        if Mode == 'T':
            S = 'linear'
        elif Mode == 'G':
            S = 'gaussian'
        else:
            print('Currently only G(gaussian) and T(triangular) Modes are supported')
            return
        kde = KernelDensity(kernel=S, bandwidth=p).fit(a)
        return (x, num.exp(kde.score_samples(x)))

    elif knn == 1:
        try:
            from sklearn.neighbors import NearestNeighbors
        except ImportError:
            print('Error:Please install sklearn package...')
            return
        neigh = NearestNeighbors(n_neighbors=p)
        neigh.fit(a)
        dist, index = neigh.kneighbors(x)
        # Distance to the p-th neighbour is the local bandwidth.
        H = dist[:, -1]
        est = [0.0] * N
        for i, point_v in enumerate(x):
            point = point_v[0]
            h = H[i]
            est[i] = sum(kernel((a - point) / h, Mode)) / (l * h)
        return (x, est)

    else:
        print('knn must be 0 or 1')
        return
def train_rlos(data, show_chart=False):
    """Train LOS estimator: one KDE of patient RLOS per (sex, age, sline).

    Parameters
    ----------
    data : iterable of mappings
        Rows with keys ``"sex"``, ``"age"``, ``"sline"`` and ``"rlos"``.
    show_chart : bool
        When true, show for every trained group a histogram of the training
        values next to a histogram of values sampled from the fitted KDE.

    Returns
    -------
    dict
        Maps ``(sex, age, sline)`` to a fitted top-hat ``KernelDensity``
        (bandwidth 0.5).  Groups with fewer than ``training_threshold``
        observations are reported and skipped.
    """
    freq = {}
    for row in data:
        sex = int(row["sex"])
        age = fp.split_age(int(row["age"]))
        sline = row["sline"]
        rlos = int(row["rlos"])
        if rlos == 0:
            print("RLOS equals zero for sex %d, age %d, SL %s" % (sex, age, sline))
        freq.setdefault((sex, age, sline), []).append(rlos)

    result = {}
    for key, train_data in freq.items():
        sex, age, sline = key
        if len(train_data) < training_threshold:
            print("Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " %
                  (training_threshold, sex, age, sline))
            continue

        X = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X)
        result[key] = kde

        if show_chart:
            # Draw as many samples as training points, capped at 500.
            n_samples = min(len(train_data), 500)
            samples = [round(s[0]) for s in kde.sample(n_samples).tolist()]

            # hist for train data
            plt.subplot(211)
            plt.title("RLOS train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('RLOS')
            plt.hist(train_data)

            # estimated density
            plt.subplot(212)
            plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('RLOS')
            plt.hist(samples)
            plt.show()
    return result
def pda_single(synth_data, data, bandwidth=.1):
    """Score ``data`` under an Epanechnikov KDE fitted to ``synth_data``.

    Parameters
    ----------
    synth_data : numpy.ndarray
        1-D array of values the density is estimated from.
    data : numpy.ndarray
        1-D array of values to score.
    bandwidth : float or ``'silverman'``
        Kernel bandwidth.  ``'silverman'`` applies the rule of thumb
        ``0.9 * min(sd, IQR / 1.34) * n ** (-1/5)`` to ``synth_data``.

    Returns
    -------
    numpy.ndarray
        ``score_samples`` output (log-density) for each element of ``data``.
    """
    synth_data = synth_data[:, np.newaxis]
    data = data[:, np.newaxis]

    if bandwidth == 'silverman':
        lower, upper = scoreatpercentile(synth_data, [25, 75])
        iqr = upper - lower
        sd = np.std(synth_data)
        # n is the size of the sample the KDE is built from (synth_data),
        # matching the sample used for sd and IQR above.
        bandwidth = .9 * min(sd, iqr/1.34) * len(synth_data)**(-1./5)

    kde = KernelDensity(kernel='epanechnikov', bandwidth=bandwidth).fit(synth_data)
    return kde.score_samples(data)
def train_admit_count(data, show_chart=False):
    """Train patient admittance number for triplet (sex, age, sline).

    Rows are grouped by ``(sex, age, sline)``; within a group the number of
    rows per distinct ``"admit"`` value is counted, and a top-hat KDE
    (bandwidth 0.5) is fitted to those counts.

    Parameters
    ----------
    data : iterable of mappings
        Rows with keys ``"sex"``, ``"age"``, ``"sline"`` and ``"admit"``.
    show_chart : bool
        When true, show for every trained group a histogram of the counts
        next to a histogram of values sampled from the fitted KDE.

    Returns
    -------
    dict
        Maps ``(sex, age, sline)`` to the fitted ``KernelDensity``.  Groups
        with fewer than ``training_threshold`` distinct ``admit`` values are
        reported and skipped.
    """
    freq = {}
    for row in data:
        sex = int(row["sex"])
        age = fp.split_age(int(row["age"]))
        sline = row["sline"]
        admit = row["admit"]
        per_admit = freq.setdefault((sex, age, sline), {})
        per_admit[admit] = per_admit.get(admit, 0) + 1

    result = {}
    for key, days in freq.items():
        sex, age, sline = key
        # Materialise the view: on Python 3 ``dict.values()`` is not a list
        # and would not become a numeric array below.
        train_data = list(days.values())
        if len(train_data) < training_threshold:
            print("Too small training set (<%d) for sex %d, age %d, SL %s. Data will be skipped. " %
                  (training_threshold, sex, age, sline))
            continue

        X = np.array([train_data]).transpose()
        kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(X)
        result[key] = kde

        if show_chart:
            # Draw as many samples as training points, capped at 500.
            n_samples = min(len(train_data), 500)
            samples = [int(round(s[0])) for s in kde.sample(n_samples).tolist()]

            # hist for train data
            plt.subplot(211)
            plt.title("Admit count train data for Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('admittance count')
            plt.hist(train_data)

            # estimated density
            plt.subplot(212)
            plt.title("Estimated density Sex=%d, Age=%d, SL=%s" % (sex, age, sline))
            plt.ylabel('freq')
            plt.xlabel('admittance count')
            plt.hist(samples)
            plt.show()
    return result
def find_centroid(data, bandwidth=0.003, iter_num=6, halfwidth=0.02):
    """Locate the KDE maximum of 2-D ``data`` by iterative grid refinement.

    Starting from the origin, each iteration scores a 20 x 20 grid spanning
    ``position +/- halfwidth`` under a Gaussian KDE of ``data``, moves to the
    best-scoring grid point and shrinks ``halfwidth``.

    Returns the final position as a length-2 array.
    """
    density = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data)

    # NOTE(review): the shrink factor below is based on 10 while the grid is
    # sampled with 20 points per axis -- confirm this is intentional.
    grid = 10
    centre = np.array([0, 0])

    for _ in range(iter_num):
        lower = centre - halfwidth
        upper = centre + halfwidth
        gx, gy = np.mgrid[lower[0]:upper[0]:20j, lower[1]:upper[1]:20j]
        candidates = np.vstack([gx.ravel(), gy.ravel()]).T
        scores = density.score_samples(candidates)
        centre = candidates[np.argmax(scores)]
        halfwidth = halfwidth*2./(grid-1.)

    return centre
def __init__(self, optimizer=None, reward_model=None, mode_classifier=KNeighborsClassifier, mode_args=None):
    """Initialise models, data buffers and the action optimizer.

    Parameters
    ----------
    optimizer :
        Optimizer to use as-is.  Defaults to a maximising ``BFGSOptimizer``
        with 3 restarts and bounds [-1, 1].
    reward_model :
        Reward model to use.  Defaults to ``GPRewardModel()``.
    mode_classifier :
        Classifier class stored in ``self.mode_classifier``.
    mode_args : dict, optional
        Stored in ``self.mode_args``; defaults to ``{'weights': 'distance'}``.
    """
    if reward_model is None:
        self.reward_model = GPRewardModel()
    else:
        self.reward_model = reward_model
    self.reward_model_fitted = False

    self.mode_classifier = mode_classifier
    # Always set the attribute: previously a caller-supplied ``mode_args``
    # was silently dropped, leaving ``self.mode_args`` undefined.
    if mode_args is None:
        mode_args = {'weights': 'distance'}
    self.mode_args = mode_args

    self.states = []
    self.actions = []
    self.rewards = []

    self.clusters = None
    self.clusters_init = False
    self.cluster_actions = []
    self.cluster_rewards = []
    self.active_clusters = []
    self.n_modes = 0

    self.sa_kde = KernelDensity()  # TODO

    if optimizer is None:
        self.optimizer = BFGSOptimizer(mode='max', num_restarts=3)
        self.optimizer.lower_bounds = -1
        self.optimizer.upper_bounds = 1  # TODO
    else:
        self.optimizer = optimizer
def test_density_plot():
    """Plot a Gaussian KDE of a two-component normal mixture.

    Draws 20 points (30% from N(0, 1), 70% from N(5, 1)), fits a Gaussian KDE
    with bandwidth 0.75 and fills the estimated density on the top-left panel
    of a 2 x 2 figure, with the samples marked just below the axis.
    """
    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)

    N = 20
    # Sample sizes must be integers; NumPy rejects float sizes.
    n_first = int(round(0.3 * N))
    n_second = int(round(0.7 * N))
    X = np.concatenate((np.random.normal(0, 1, n_first),
                        np.random.normal(5, 1, n_second)))[:, np.newaxis]
    print(np.shape(X))

    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    print(np.shape(X_plot))

    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
    log_dens = kde.score_samples(X_plot)

    ax[0, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    ax[0, 0].text(-3.5, 0.31, "Gaussian Kernel Density")
    ax[0, 0].plot(X[:, 0], np.zeros(X.shape[0]) - 0.01, '+k')
    plt.show()
def makeKDE(m):
    """Gaussian KDE of the values of ``m`` lying strictly inside (-100, 100).

    The bandwidth is ``(4/3 * sigma**5 / n) ** (1/5)`` with ``sigma`` the
    standard deviation and ``n`` the number of retained values.  The density
    is evaluated on 1000 points spanning the module-level ``rng``.

    Returns
    -------
    X_plot : numpy.ndarray
        ``(1000, 1)`` evaluation grid.
    log_dens_exp : numpy.ndarray
        Density on the grid (all -99.99 if the KDE could not be built).
    KDE_mag : float
        Grid location of the density maximum (-99.99 on failure).
    sigma : float
        Standard deviation of the retained values (-99.99 on failure).
    """
    m = m[(m > -100) & (m < 100)]
    m = m[:, np.newaxis]  # Training data
    l = len(m)
    sigma = np.std(m)
    kdebw = (1.*4/3*sigma**5 / l)**(1./5.)

    # Built outside the ``try`` so the fallback branch can always rely on it.
    X_plot = np.linspace(rng[0], rng[1], 1000)[:, np.newaxis]
    try:
        kde = KernelDensity(kernel='gaussian', bandwidth=kdebw).fit(m)
        log_dens = kde.score_samples(X_plot)
        log_dens_exp = np.exp(log_dens)
        # ``np.float`` no longer exists; index the scalar explicitly instead
        # of converting a one-element array.
        KDE_mag = float(X_plot[np.argmax(log_dens_exp), 0])
    except ValueError:
        # Sentinel values signalling that no density could be estimated.
        log_dens_exp = np.ones(len(X_plot[:, 0]))*-99.99
        KDE_mag, sigma = -99.99, -99.99

    return X_plot, log_dens_exp, KDE_mag, sigma
class OneClassKDE(BaseClassifier):
    """One-class outlier detector thresholding a Gaussian KDE log-density.

    The threshold is the 10th percentile of the training scores; samples
    scoring below it are labelled -1, all others +1.
    """

    _fit_params = ["bandwidth"]

    def __init__(self, *args, **kwargs):
        """Read the required ``bandwidth`` keyword argument."""
        self.bandwidth = kwargs["bandwidth"]

    def fit(self, data, **kwargs):
        """Fit the KDE on ``data`` and derive the outlier threshold from it."""
        self.kde = KernelDensity(kernel='gaussian', bandwidth=self.bandwidth)
        self.kde.fit(data)
        self.training_score = self.kde.score_samples(data)
        self.direct_thresh = numpy.percentile(self.training_score, 10)

    def predict(self, data):
        """Return -1 for samples below the density threshold, +1 otherwise."""
        score = self.kde.score_samples(data)
        self.score = score
        return (score < self.direct_thresh).astype(numpy.int32)*-2+1

    def decision_function(self, data):
        """Return the KDE log-density scores of ``data``.

        Passing ``None`` returns the scores cached by the last ``predict``
        call, which is what this method returned for every input before.
        """
        if data is None:
            return self.score
        return self.kde.score_samples(data)
def plot_2d(i1, i2):
    """Scatter two columns of the module-level ``output``, coloured by density.

    A top-hat KDE (bandwidth 0.01) is fitted to the two selected columns and
    every point is coloured by its own score.  The figure is saved to
    ``ML_data/<name>_pca_<i1>_<i2>.png``, with ``name`` read from module scope.

    Parameters
    ----------
    i1, i2 : int
        Column indices of ``output`` plotted on the x and y axes.
    """
    plt.clf()

    xy = np.vstack([output[:, i1], output[:, i2]]).T
    kde = KernelDensity(kernel='tophat', bandwidth=0.01, leaf_size=10).fit(xy)
    z = kde.score_samples(xy)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = output[idx, i1], output[idx, i2], z[idx]

    plt.xlabel('Component %i' % i1)
    plt.ylabel('Component %i' % i2)
    # 'none' is the documented way to draw markers without an edge; the empty
    # string is not a valid colour specification.
    plt.scatter(x, y, c=z, s=10, edgecolor='none')
    plt.savefig('ML_data/%s_pca_%s_%s.png' % (name, i1, i2))
def chart_by_time():
    """Plot trip-duration densities for five time-of-week categories.

    Every entry of the module-level ``trajs_with_time`` has a timestamp at
    index 0; the sum of its remaining items is the trip duration.  Trips are
    split into weekday AM rush (hours 7-9), PM rush (17-19), midday (10-16),
    night (20-6) and weekend.  The category means are printed, and a KDE
    (bandwidth 70) of each category is plotted over the pooled duration range.
    """
    def durations(keep):
        # Total duration of every trajectory whose start time passes ``keep``.
        return [sum(traj[1:]) for traj in trajs_with_time if keep(traj[0])]

    def on_weekday(start):
        return start.weekday() not in [5, 6]

    weekday_amrush = durations(
        lambda start: on_weekday(start) and start.hour in [7, 8, 9])
    weekday_pmrush = durations(
        lambda start: on_weekday(start) and start.hour in [17, 18, 19])
    weekday_midday = durations(
        lambda start: on_weekday(start) and start.hour in [10, 11, 12, 13, 14, 15, 16])
    weekday_night = durations(
        lambda start: on_weekday(start) and start.hour in [20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6])
    weekend = durations(lambda start: start.weekday() in [5, 6])

    groups = [weekday_amrush, weekday_pmrush, weekday_midday, weekday_night, weekend]
    colours = ['r', 'y', 'g', 'b', 'm']

    means = [sum(group) / float(len(group)) for group in groups]
    print("weekday_amrush_avg: ", means[0], "weekday_pmrush_avg: ", means[1], "weekday_midday_avg: ", means[2], "weekday_night_avg: ", means[3], "weekend_avg: ", means[4])

    # Common evaluation grid spanning every observed duration.
    pooled = weekday_amrush + weekday_pmrush + weekday_midday + weekday_night + weekend
    x = np.linspace(min(pooled), max(pooled), 100).reshape(-1, 1)

    densities = []
    for group in groups:
        kde = KernelDensity(bandwidth=70).fit(np.array(group).reshape(-1, 1))
        densities.append(np.exp(kde.score_samples(x)))

    for density, colour in zip(densities, colours):
        plt.plot(x, density, colour)

    plt.xlabel("Time start to endpoint")
    plt.ylabel("Density")
    plt.show()
def simplify_data2(x, y, size):
    """Plot ``x`` against a piecewise-constant simplification of it.

    A top-hat KDE (bandwidth 0.5) is fitted to ``x`` and scored on ``len(x)``
    evenly spaced points in ``[0, size]``.  The local minima of that score
    split ``x`` into consecutive segments; each element is replaced by its
    segment mean, and both series are plotted and shown.

    Parameters
    ----------
    x : numpy.ndarray
        Column array, one sample per row (it is passed to ``fit`` as-is).
    y :
        Unused.
    size : number
        Upper end of the evaluation grid.
    """
    kde = KernelDensity(kernel='tophat', bandwidth=0.5).fit(x)
    s = np.linspace(0, size, len(x))
    e = kde.score_samples(s.reshape(-1, 1))
    mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]

    result = []
    start = 0
    for i in mi:
        i = int(i)
        result.extend([np.average(x[start:i])] * (i - start))
        start = i
    # Remaining elements after the last minimum.
    result.extend([np.average(x[start:])] * (len(x) - start))

    print(mi)
    print(ma)
    plt.plot(s, x.reshape(1, -1)[0])
    plt.plot(s, result)
    plt.show()
def chart_by_day():
    """Compare weekend and weekday trip-duration densities.

    Every entry of the module-level ``trajs_with_time`` has a timestamp at
    index 0; the sum of its remaining items is the trip duration.  The mean
    duration of each group is printed, and a KDE (bandwidth 100) of each group
    is plotted over the pooled range (weekend red, weekday blue).
    """
    #
    # On average, trips on the weekend take less time than trips on weekdays
    # 1337 sec versus 1446 sec
    #
    weekend_times = []
    weekday_times = []
    for traj in trajs_with_time:
        if traj[0].weekday() in [5, 6]:
            weekend_times.append(sum(traj[1:]))
    for traj in trajs_with_time:
        if traj[0].weekday() not in [5, 6]:
            weekday_times.append(sum(traj[1:]))

    weekend = sum(weekend_times) / float(len(weekend_times))
    weekday = sum(weekday_times) / float(len(weekday_times))
    print("weekend: ", weekend, "weekday: ", weekday)

    pooled = weekend_times + weekday_times
    x = np.linspace(min(pooled), max(pooled), 100).reshape(-1, 1)

    weekend_column = np.array(weekend_times).reshape(-1, 1)
    density_weekend = np.exp(
        KernelDensity(bandwidth=100).fit(weekend_column).score_samples(x))

    weekday_column = np.array(weekday_times).reshape(-1, 1)
    density_weekday = np.exp(
        KernelDensity(bandwidth=100).fit(weekday_column).score_samples(x))

    plt.plot(x, density_weekend, 'r')
    plt.plot(x, density_weekday, 'b')
    plt.xlabel("Time start to Grand Ave: red: weekend, blue, weekday")
    plt.ylabel("Density")
    plt.show()
def density_estimation(data, img, imagepath):
    """Show a landmark density map and the most typical landmark set.

    ``data`` is scaled by 4; its even rows are read as x coordinates and its
    odd rows as y coordinates.  A first figure shows a ``scipy`` Gaussian KDE
    of all (x, y) pairs.  A second figure shows the image loaded from
    ``imagepath`` with, overlaid, the column of ``data`` that scores highest
    under a scikit-learn Gaussian KDE fitted to the columns.

    NOTE(review): the ``img`` argument is overwritten immediately by the image
    read from ``imagepath``.
    """
    img = io.imread(imagepath, as_gray=True)
    print(data.shape)

    data = data * 4
    n_landmarks = data.shape[0] // 2
    x = data[::2, :].ravel()
    y = data[1::2, :].ravel()

    xmin, xmax = np.min(x), np.max(x)
    ymin, ymax = np.min(y), np.max(y)

    # 100 x 100 evaluation grid covering the bounding box of all points.
    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    grid_points = np.vstack([X.ravel(), Y.ravel()])
    point_kde = stats.gaussian_kde(np.vstack([x, y]), bw_method=0.2)
    Z = np.reshape(point_kde(grid_points), X.shape)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
              extent=[xmin, xmax, ymin, ymax])
    ax.plot(x, y, '+k', markersize=0.5)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.gca().invert_yaxis()
    ax.axis('off')
    plt.show()

    fig, ax_ = plt.subplots()
    ax_.imshow(img, cmap=plt.cm.gray)

    # Each column of ``data`` is treated as one sample.
    columns = data.T
    column_kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(columns)
    column_scores = column_kde.score_samples(columns)
    best_column = np.argmax(np.exp(column_scores))

    best_shape = np.reshape(data[:, best_column], (n_landmarks, 2))
    a = best_shape[:, 0]
    b = best_shape[:, 1]
    print(a[0])
    print(b[0])

    ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3)
    # First landmark highlighted in blue.
    ax_.plot(a[0], b[0], 'b.', markersize=8, mec='k', mew=0.3)
    ax_.axis('off')
    plt.show()
return data, lab X, y = gen_cb(5000, .25, 3.14159 / 4) X_test, y_test = gen_cb(5000, .25, 3.14159 / 4) plt.figure() plt.title('Initial checker board data plot') plt.plot(X[np.where(y == 1)[0], 0], X[np.where(y == 1)[0], 1], 'o') plt.plot(X[np.where(y == 2)[0], 0], X[np.where(y == 2)[0], 1], 's', c='r') # plt.show() X1 = X[np.where(y == 1)[0], :] X2 = X[np.where(y == 2)[0], :] # Kernel density functions kdfX1 = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(X1) kdfX2 = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(X2) score1 = kdfX1.score_samples(X_test) score2 = kdfX2.score_samples(X_test) score1Exp = math.e**(score1) score2Exp = math.e**(score2) Y = [] i = 0 for i in range(len(X_test)): if score1Exp[i] > score2Exp[i]: Y.append(1) else: Y.append(2)
# part 1: categorise the locations
# test
# Compare Epanechnikov KDEs of one coordinate at several bandwidths, one
# panel per bandwidth.  ``Cx``, ``Cy`` and ``size`` come from earlier code.
f, ax = plt.subplots(2, 2)

plotX = False
if plotX:
    x = Cx
    half_span = 180
else:
    x = Cy
    half_span = 90
X_plot = np.linspace(-half_span, half_span, len(x))[:, np.newaxis]

# KDE
for panel, panel_bandwidth, title_suffix in (
        (ax[0, 0], 0.05, "k, b=0.05"),
        (ax[0, 1], 0.75, "k, b=0.75")):
    kde = KernelDensity(kernel='epanechnikov', bandwidth=panel_bandwidth).fit(x)  # gaussian
    log_dens = kde.score_samples(X_plot)
    panel.fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    panel.set_title(label="epanechnikov Kernel, size =" + str(size / 1000) + title_suffix,
                    loc="center")

# KDE
kde = KernelDensity(kernel='epanechnikov', bandwidth=2.25).fit(x)  # gaussian
# Read "<id>,<x>,<y>" rows from the t-SNE CSV, estimate a Gaussian KDE
# (bandwidth 0.2) over the points and write id/x/y/kde records as JSON.
# ``position`` comes from earlier code.
file_name = 'SSB'
file_path = r'../data/' + position + '/' + file_name + '_Tsne.csv'
file_write_path = r'../data/' + position + '/' + file_name + '_id_x_y_kde.json'
print('file_path: {}'.format(file_path))
print('file_write_path: {}'.format(file_write_path))

ans_dict = {}
temp_list = []
id_list = []
with open(file_path) as f:
    for line in f:
        line = line.replace('\n', '').split(',')
        id_list.append(line[0])
        # The KDE needs numeric coordinates; the JSON record keeps the
        # original text so the written x/y values are unchanged.
        temp_list.append([float(line[1]), float(line[2])])
        ans_dict[line[0]] = {'id': line[0], 'x': line[1], 'y': line[2]}

X = np.array(temp_list)
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
kde_list = np.exp(kde.score_samples(X))

print(id_list)
for i, point_id in enumerate(id_list):
    ans_dict[point_id]['kde'] = kde_list[i]
    print(i)
print(ans_dict)

# The context manager guarantees the output file is flushed and closed.
with open(file_write_path, 'w+') as fw:
    fw.write(json.dumps(ans_dict))
} # out.txt as Input and outf.txt as Output with open('out.txt') as infile, open('outf.txt', 'w') as outfile: for line in infile: for src, target in replacements2.iteritems(): line = line.replace(src, target) outfile.write(line) X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) val = val = ast.literal_eval(open(fp).read()) size = val[len(val) - 1] kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X) print(kde.score_samples(X)) plt.title("Kernel Density Estimation") plt.plot(linspace(0, size, size * 2), kde, 'r') plt.xlabel("Sequence length") plt.ylabel("Probability density") tree = xml.parse('0b8aef4d2de04dea51228e406270662035904866.LOOPER.xml') root = tree.getroot() text_file = open("Output.txt", "w") sys.stdout = text_file for elem in tree.iter(): print >> text_file, elem.tag
class GAE:
    """Autoencoder whose decoder is refined adversarially.

    Three Keras models share weights: ``autoencoder`` (encoder followed by
    decoder), ``discriminator`` (image -> real/fake score) and
    ``decoder_discriminator`` (latent code -> decoder -> discriminator).
    Latent codes for generation are drawn from a KernelDensity model fitted
    on the encoded training set (see ``train``).
    """

    def __init__(self, img_shape=(28, 28), encoded_dim=2):
        """Build the sub-models and compile them.

        :param img_shape: shape of one input image.
        :param encoded_dim: size of the latent code.
        """
        self.img_shape = img_shape
        self.encoded_dim = encoded_dim
        self.optimizer = Adam(0.001)
        self.optimizer_discriminator = Adam(0.00001)
        self.discriminator = self.get_discriminator_model(img_shape)
        self.decoder = self.get_decoder_model(encoded_dim, img_shape)
        self.encoder = self.get_encoder_model(img_shape, encoded_dim)
        # Initialize Autoencoder
        img = Input(shape=self.img_shape)
        encoded_repr = self.encoder(img)
        gen_img = self.decoder(encoded_repr)
        self.autoencoder = Model(img, gen_img)
        # Initialize Discriminator
        latent = Input(shape=(encoded_dim,))
        gen_image_from_latent = self.decoder(latent)
        is_real = self.discriminator(gen_image_from_latent)
        self.decoder_discriminator = Model(latent, is_real)
        # Finally compile models
        self.initialize_full_model(encoded_dim)

    def initialize_full_model(self, encoded_dim):
        """Compile the three models (``encoded_dim`` is currently unused)."""
        self.autoencoder.compile(optimizer=self.optimizer, loss='mse')
        self.discriminator.compile(optimizer=self.optimizer,
                                   loss='binary_crossentropy',
                                   metrics=['accuracy'])
        # Default start discriminator is not trainable
        # The layers are frozen after the stand-alone discriminator has been
        # compiled and before the stacked model is compiled.
        for layer in self.discriminator.layers:
            layer.trainable = False
        self.decoder_discriminator.compile(
            optimizer=self.optimizer_discriminator,
            loss='binary_crossentropy',
            metrics=['accuracy'])

    @staticmethod
    def get_encoder_model(img_shape, encoded_dim):
        """Return the image -> latent code network."""
        encoder = Sequential()
        encoder.add(Flatten(input_shape=img_shape))
        encoder.add(Dense(1000, activation='relu'))
        encoder.add(Dense(1000, activation='relu'))
        encoder.add(Dense(encoded_dim))
        encoder.summary()
        return encoder

    @staticmethod
    def get_decoder_model(encoded_dim, img_shape):
        """Return the latent code -> image network (sigmoid output)."""
        decoder = Sequential()
        decoder.add(Dense(1000, activation='relu', input_dim=encoded_dim))
        decoder.add(Dense(1000, activation='relu'))
        decoder.add(Dense(np.prod(img_shape), activation='sigmoid'))
        decoder.add(Reshape(img_shape))
        decoder.summary()
        return decoder

    @staticmethod
    def get_discriminator_model(img_shape):
        """Return the image -> probability-of-real network.

        ``initializer`` is not defined in this class; it is expected to be
        provided at module level.
        """
        discriminator = Sequential()
        discriminator.add(Flatten(input_shape=img_shape))
        discriminator.add(Dense(1000, activation='relu',
                                kernel_initializer=initializer,
                                bias_initializer=initializer))
        discriminator.add(Dense(1000, activation='relu',
                                kernel_initializer=initializer,
                                bias_initializer=initializer))
        discriminator.add(Dense(1, activation='sigmoid',
                                kernel_initializer=initializer,
                                bias_initializer=initializer))
        discriminator.summary()
        return discriminator

    def imagegrid(self, epochnumber):
        """Decode a 10x10 grid of latent points and save it as ``<epochnumber>.png``.

        The grid covers [-2.5, 2.0] in steps of 0.5 on both latent axes, so
        it assumes a two-dimensional latent space.
        """
        fig = plt.figure(figsize=[20, 20])
        for i in range(-5, 5):
            for j in range(-5, 5):
                topred = np.array((i * 0.5, j * 0.5))
                topred = topred.reshape((1, 2))
                img = self.decoder.predict(topred)
                img = img.reshape(self.img_shape)
                ax = fig.add_subplot(10, 10, (i + 5) * 10 + j + 5 + 1)
                ax.set_axis_off()
                ax.imshow(img)
        fig.savefig(str(epochnumber) + ".png")
        plt.show()
        plt.close(fig)

    def train(self, x_train_input, batch_size=128, epochs=5):
        """Train the autoencoder, fit the latent KDE, then train adversarially.

        :param x_train_input: training images.
        :param batch_size: mini-batch size for every stage.
        :param epochs: epochs for the autoencoder and the initial
            discriminator fit.
        """
        # NOTE(review): the glob patterns look for "weights_mnist_*" while the
        # checkpoints below are written as "weights_autoencoder.*" /
        # "weights_discriminator.*" -- confirm which naming is intended.
        fileNames = glob.glob('models/weights_mnist_autoencoder.*')
        fileNames.sort()
        if len(fileNames) != 0:
            # The epoch number is the second dot-separated field of the name.
            saved_epoch = int(fileNames[-1].split('.')[1])
            self.autoencoder.load_weights(fileNames[-1])
        else:
            saved_epoch = -1
        if saved_epoch < epochs - 1:
            self.autoencoder.fit(
                x_train_input, x_train_input,
                batch_size=batch_size, epochs=epochs,
                callbacks=[
                    keras.callbacks.ModelCheckpoint(
                        'models/weights_autoencoder.{epoch:02d}.hdf5',
                        verbose=0, save_best_only=False,
                        save_weights_only=False, mode='auto', period=1),
                    keras.callbacks.EarlyStopping(
                        monitor='loss', patience=3, min_delta=1e-4,
                        restore_best_weights=True)])

        print("Training KDE")
        codes = self.encoder.predict(x_train_input)
        self.kde = KernelDensity(kernel='gaussian', bandwidth=3.16).fit(codes)

        print("Initial Training of discriminator")
        fileNames = glob.glob('models/weights_mnist_discriminator.*')
        fileNames.sort()
        if len(fileNames) != 0:
            saved_epoch = int(fileNames[-1].split('.')[1])
            self.discriminator.load_weights(fileNames[-1])
        else:
            saved_epoch = -1
        train_count = len(x_train_input)
        if saved_epoch < epochs - 1:
            # Combine real and fake images for discriminator training
            imgs_fake = self.generate(n=train_count)
            valid = np.ones((train_count, 1))  # result for training images
            fake = np.zeros((train_count, 1))  # result for generated fakes
            labels = np.vstack([valid, fake])  # combine together
            images = np.vstack([x_train_input, imgs_fake])
            # Train the discriminator
            self.discriminator.fit(
                images, labels, epochs=epochs, batch_size=batch_size,
                shuffle=True,
                callbacks=[
                    keras.callbacks.ModelCheckpoint(
                        'models/weights_discriminator.{epoch:02d}.hdf5',
                        verbose=0, save_best_only=False,
                        save_weights_only=False, mode='auto', period=1),
                    keras.callbacks.EarlyStopping(
                        monitor='loss', patience=3, min_delta=1e-4,
                        restore_best_weights=True)])

        print("Training GAN")
        self.generateAndPlot(x_train_input, fileName="before_gan.png")
        self.trainGAN(x_train_input,
                      epochs=int(train_count / batch_size),
                      batch_size=batch_size)
        self.generateAndPlot(x_train_input, fileName="after_gan.png")

    def trainGAN(self, x_train_input, epochs=1000, batch_size=128):
        """Alternate discriminator and decoder updates for ``epochs`` steps."""
        half_batch = int(batch_size / 2)
        for epoch in range(epochs):
            # ---------------Train Discriminator -------------
            # Select a random half batch of images
            idx = np.random.randint(0, x_train_input.shape[0], half_batch)
            imgs_real = x_train_input[idx]
            # Generate a half batch of new images
            imgs_fake = self.generate(n=half_batch)
            valid = np.ones((half_batch, 1))
            fake = np.zeros((half_batch, 1))
            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch(imgs_real, valid)
            d_loss_fake = self.discriminator.train_on_batch(imgs_fake, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            codes = self.kde.sample(batch_size)
            # Generator wants the discriminator to label the generated
            # representations as valid
            valid_y = np.ones((batch_size, 1))
            # Train generator
            g_logg_similarity = self.decoder_discriminator.train_on_batch(
                codes, valid_y)
            # Plot the progress
            if epoch % 50 == 0:
                print("epoch %d [D accuracy: %.2f] [G accuracy: %.2f]" %
                      (epoch, d_loss[1], g_logg_similarity[1]))

    def generate(self, n=10000):
        """Sample ``n`` latent codes from the fitted KDE and decode them."""
        codes = self.kde.sample(n)
        images = self.decoder.predict(codes)
        return images

    def generateAndPlot(self, x_train_input, n=10, fileName="generated.png"):
        """Save an ``n`` x ``n + 1`` grid of generated images to ``fileName``.

        The last cell of every row shows ``findNearest(x_train_input, image)``
        for the image generated just before it.
        """
        fig = plt.figure(figsize=[20, 20])
        images = self.generate(n * n)
        index = 1
        for image in images:
            image = image.reshape(self.img_shape)
            ax = fig.add_subplot(n, n + 1, index)
            index = index + 1
            ax.set_axis_off()
            ax.imshow(image)
            # After n generated images the next cell closes the row.
            if index % (n + 1) == 0:
                nearest = findNearest(x_train_input, image)
                ax = fig.add_subplot(n, n + 1, index)
                index = index + 1
                ax.imshow(nearest)
        fig.savefig(fileName)
        plt.show()
        # Release the figure, as imagegrid does; otherwise every call leaves
        # an open figure behind.
        plt.close(fig)

    @staticmethod
    def mean_log_likelihood(x_test_input):
        """Return the mean log-density of ``x_test_input`` under a Gaussian
        KDE (bandwidth 0.2) fitted on that same data.

        Previously the fitted model was discarded and ``None`` was returned.
        """
        kde = KernelDensity(kernel='gaussian',
                            bandwidth=0.2).fit(x_test_input)
        return kde.score_samples(x_test_input).mean()
# Tail of a definition whose beginning lies outside this fragment.
    return P_norm


def density(reads, K0=3000, K1=100_000, kde_method="linear"):
    """
    Estimate the density of distances between the two pieces of each read.

    A KernelDensity model (bandwidth 200) is fitted on
    ``|reads[:, 0] - reads[:, 1]|``; its log-density is then approximated by
    a degree-30 polynomial up to ``K0`` and by ``a + b * log(x)`` beyond it.

    :param reads: reads[i,0] - position of the first piece of the read;
                  reads[i,1] - position of the second piece of the read
    :param K0: end of the polynomial approximation
    :param K1: end of the first log approximation
    :param kde_method: type of kernel in KernelDensity
    :return: nothing in the code visible here -- the piecewise function ``P``
             is built but not returned (NOTE(review): the function looks
             truncated; an earlier docstring also listed a ``K2`` parameter
             that is not in the signature).
    """
    distances = np.abs(reads[:, 0] - reads[:, 1])
    kde = KernelDensity(kernel=kde_method, bandwidth=200).fit(distances.reshape(-1, 1))
    # f(x): log-density of the fitted KDE at the 1-D points x.
    f = lambda x: kde.score_samples(x.reshape(-1, 1))
    # proximal
    degree = 30
    # The polynomial is fitted on log-spaced points from 1 to K0 + 1000, i.e.
    # slightly past K0 so that it overlaps the log fit below.
    x0 = np.logspace(0, np.log10(K0 + 1000), 500)
    param0 = np.polyfit(x0, f(x0), degree)
    # The log-law is fitted on log-spaced points from K0 - 1000 to K1.
    x1 = np.logspace(np.log10(K0 - 1000), np.log10(K1), 500)
    p = lambda x, a, b: a + b * np.log(x)
    param1, cov = curve_fit(p, x1, f(x1))
    # Piecewise approximation: polynomial below K0, log-law otherwise.  Both
    # branches of the inner np.where are the same expression, so K1 does not
    # change the result here.
    P = (lambda x: np.where(x < K0, np.poly1d(param0)(x), np.where(x < K1, param1[0] + param1[1] * np.log(x), param1[0] + param1[1] * np.log(x))))
def plotTrajAlignment():
    """Draw the illustrative trajectory-alignment figure and return it.

    The figure shows a sigmoid biomarker trajectory, noisy sample points for
    two diagnostic groups, a scaled kernel density estimate of each group's
    x positions and a dashed vertical line at the origin.

    Relies on the module-level names ``pl``, ``trajCol``, ``lw``,
    ``markerSize`` and ``diagCols``.

    :return: the matplotlib figure.
    """
    # Import from the public package path; ``sklearn.neighbors.kde`` was a
    # private module and no longer exists in current scikit-learn releases.
    from sklearn.neighbors import KernelDensity

    tau = -10  # transition time, use this to find the best b that gives slope a*b/4
    a = 1
    # b = 16/(a*tau)
    c = 4
    d = 5
    fSig = lambda x: a + d / (1 + np.exp((-4 / tau) * (x - c)))

    xMin = -12
    xMax = 25
    xs = np.linspace(xMin, xMax, 50)

    fig = pl.figure(2, figsize=(6, 4))
    ax = pl.subplot(111)
    ax.set_frame_on(True)
    fig.subplots_adjust(top=0.75)

    # plot trajectory
    ax.plot(xs, fSig(xs), c=trajCol, linewidth=lw, label='Trajectory')

    # Fixed seed so the perturbed points are the same on every call.
    np.random.seed(2)

    # plot points
    xsPoints = np.array([
        -10, -8.3, -7.0, -6.25, -5., -4.2, -2.4, -0.6, 0.4, 1.4, 3.2, 5.05,
        6.0, 6.9, 7.9, 8.75, 9.2, 10.25, 12.5
    ])
    diag = np.array([1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    ysPoints = fSig(xsPoints)
    ysPointsPerturb = [0, 0]
    print('xsPoints', xsPoints, ysPoints)

    diagLabels = ['Controls', 'Patients']
    ax = pl.gca()
    xLim = (xMin, xMax - 9)
    yLim = (-4, 8)
    xOrigin = 0

    kdeXs = np.linspace(xLim[0], xLim[1], num=100).reshape(-1, 1)
    kernelWidth = np.std(
        xsPoints) / 5.5  # need to test this parameter by visualisation

    # The loop variable is deliberately not called ``d``: ``fSig`` closes over
    # ``d`` and would silently change if that name were rebound here.
    for diagNr in [1, 2]:
        inGroup = diag == diagNr
        ysPointsPerturb[diagNr - 1] = ysPoints[inGroup] + np.random.normal(
            loc=0, scale=0.5, size=diag[inGroup].shape[0])
        ax.scatter(xsPoints[inGroup],
                   ysPointsPerturb[diagNr - 1],
                   marker='x',
                   s=markerSize,
                   linewidths=lw,
                   c=diagCols[diagNr],
                   label=diagLabels[diagNr - 1])

        kdeCurr = KernelDensity(kernel='gaussian', bandwidth=kernelWidth).fit(
            xsPoints[inGroup].reshape(-1, 1))
        scores = np.exp(kdeCurr.score_samples(kdeXs))
        # NOTE(review): by operator precedence this subtracts
        # min / (max - min) from the scores; it is not a min-max
        # normalisation.  Left unchanged because the factor 2 and the axis
        # limits below appear to be tuned against it -- confirm intent.
        scaledScores = scores - np.min(scores) / (np.max(scores) -
                                                  np.min(scores))
        scaledScores = scaledScores * (yLim[1] - yLim[0]) * 2 + yLim[0]
        pl.fill_between(kdeXs.reshape(-1),
                        yLim[0],
                        scaledScores,
                        facecolor=diagCols[diagNr],
                        alpha=0.4)

        # Put the group label just above the peak of its density.
        maxScore = np.max(scaledScores) + 0.25
        maxInd = np.argmax(scaledScores)
        pl.text(kdeXs[maxInd, 0] - 1.5,
                maxScore,
                diagLabels[diagNr - 1],
                color=diagCols[diagNr])

    pl.plot([xOrigin, xOrigin], [yLim[0], yLim[1]],
            '--',
            c=(0.5, 0.5, 0.5),
            linewidth=lw,
            label='Disease onset')

    pl.xlim(xLim)
    pl.ylim(yLim)
    # pl.legend(ncol=2, loc='upper center')
    pl.legend(ncol=2, bbox_to_anchor=(0.05, 1.27, 0.95, .102))
    pl.xlabel('Years since $t_0$')
    pl.ylabel('Biomarker Z-Score')
    # The tick labels are remapped so the plotted range reads as a Z-score.
    ax.set_yticks([-4, -2, 0, 2, 4, 6, 8])
    ax.set_yticklabels(['', '', '-3', '-2', '-1', '0', '1'])
    pl.gcf().subplots_adjust(left=0.13, bottom=0.15, top=0.75)

    fig.show()
    return fig
def _bivariate_kdeplot(x, y, xscale=None, yscale=None, shade=False,
                       bw="scott", gridsize=50, cut=3, clip=None,
                       legend=True, legend_data=None, **kwargs):
    """Draw a 2-D kernel density estimate of (x, y) as contours on the current axes.

    The data are transformed with ``xscale`` / ``yscale``, the density is
    estimated in the transformed space, and the evaluation grid is mapped
    back with ``scale.inverse`` before plotting.

    :param x, y: samples; pairs containing NaN after scaling are dropped.
    :param xscale, yscale: callables with an ``inverse`` method; ``None``
        means no transformation.
    :param shade: use filled contours (``contourf``) instead of lines.
    :param bw: ``'scott'``, ``'silverman'`` or a number (bandwidth).
    :param gridsize: number of grid points per axis.
    :param cut: passed to ``_kde_support``.
    :param clip: ``None`` for no clipping, one ``(low, high)`` pair applied
        to both axes, or a pair of such pairs; passed to ``_kde_support``
        (i.e. it applies to the scaled data).
    :param legend: label the axes from ``x.name`` / ``y.name`` if present.
    :param legend_data: optional dict that receives a legend proxy artist
        under ``kwargs['label']``.
    :param kwargs: ``color`` (required), ``n_levels``, ``min_alpha``,
        ``max_alpha``; everything else goes to the contour call.
    :return: the axes drawn on.
    :raises util.CytoflowViewError: for an invalid ``bw`` or when the
        contour call fails.
    """
    ax = plt.gca()

    # Determine the clipping.  The argument used to be overwritten
    # unconditionally, so a caller-supplied range was silently ignored.
    if clip is None:
        clip = [(-np.inf, np.inf), (-np.inf, np.inf)]
    elif np.ndim(clip) == 1:
        clip = [clip, clip]

    if xscale is not None:
        x = xscale(x)
    if yscale is not None:
        y = yscale(y)

    # Keep only the pairs where both coordinates are valid.
    keep = ~(np.isnan(x) | np.isnan(y))
    x = x[keep]
    y = y[keep]

    if bw == 'scott':
        bw_x = bw_scott(x)
        bw_y = bw_scott(y)
        bw = (bw_x + bw_y) / 2
    elif bw == 'silverman':
        bw_x = bw_silverman(x)
        bw_y = bw_silverman(y)
        bw = (bw_x + bw_y) / 2
    elif isinstance(bw, (int, float)):
        bw_x = bw_y = bw
    else:
        raise util.CytoflowViewError(None,
                                     "Bandwith must be 'scott', 'silverman' or a float")

    kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(
        np.column_stack((x, y)))

    x_support = _kde_support(x, bw_x, gridsize, cut, clip[0])
    y_support = _kde_support(y, bw_y, gridsize, cut, clip[1])

    xx, yy = np.meshgrid(x_support, y_support)
    # score_samples returns log-density.
    z = kde.score_samples(np.column_stack((xx.ravel(), yy.ravel())))
    z = np.exp(z.reshape(xx.shape))

    # Take every option meant for this function out of kwargs *before* the
    # rest is forwarded to matplotlib.
    n_levels = kwargs.pop("n_levels", 10)
    color = kwargs.pop("color")
    min_alpha = kwargs.pop("min_alpha", 0.2)
    max_alpha = kwargs.pop("max_alpha", 0.9)
    if shade:
        min_alpha = 0
    kwargs['colors'] = (color, )

    # Plot in data coordinates: map the grid back through the scales.
    if xscale is not None:
        x_support = xscale.inverse(x_support)
    if yscale is not None:
        y_support = yscale.inverse(y_support)
    xx, yy = np.meshgrid(x_support, y_support)

    contour_func = ax.contourf if shade else ax.contour
    try:
        cset = contour_func(xx, yy, z, n_levels, **kwargs)
    except ValueError as e:
        raise util.CytoflowViewError(
            None,
            "Something went wrong in {}, bandwidth = {}. "
            .format(contour_func.__name__, bw)) from e

    # Fade the contour levels from min_alpha to max_alpha.
    num_collections = len(cset.collections)
    alpha = np.linspace(min_alpha, max_alpha, num=num_collections)
    for el in range(num_collections):
        cset.collections[el].set_alpha(alpha[el])

    # Label the axes
    if hasattr(x, "name") and legend:
        ax.set_xlabel(x.name)
    if hasattr(y, "name") and legend:
        ax.set_ylabel(y.name)

    # Add legend data (only when the caller supplied a dict to fill).
    if 'label' in kwargs and legend_data is not None:
        legend_data[kwargs['label']] = plt.Rectangle((0, 0), 1, 1, fc=color)

    return ax
def game(players, *ps, **kwargs):
    """Update every player's prior from one observed game result.

    ``ps`` lists the players in finishing order (first place first).  Games
    are simulated from the current priors, the simulations whose ranking
    equals the observed one are kept, and a new KernelDensity model fitted on
    each player's simulated average is appended to ``players[p]``.

    :param players: dict mapping a player name to a list of density models;
        the last entry is the current prior and must offer ``sample(1)``.
    :param ps: player names in observed finishing order.
    :param kwargs: ``n`` -- number of games to simulate (default 100000).
    :return: ``None``.  Returns early, leaving ``players`` untouched, when a
        name is unknown or when no simulated game matches the observed
        ranking.
    """
    ranks = {}
    for p in ps:
        # Record what place they got (0 is first place)
        ranks[p] = ps.index(p)
        if p not in players:
            print('Add "' + p + '" to players object with add_player()')
            return

    # Simulate n games
    n = int(kwargs.get('n', 100000))

    # Define function that represents a single simulation
    def single_simulation(ps):
        lambs = {}
        turns = {}
        # Cycle through each person to get their game performance
        for p in ps:
            # Simulate a true AVERAGE of turns for each person from their
            # prior distribution.  sample(1) yields a one-element array; it
            # is reduced to a plain float so the Poisson draw below does not
            # depend on broadcasting an array-shaped rate against size=10.
            lambs[p] = float(np.ravel(np.abs(players[p][-1].sample(1)))[0])
            # From this average generate how many turns it would take them
            # to finish a game from a Poisson distribution.  Generating
            # multiple numbers per game to serve as a tie breaker.
            turns[p] = list(np.random.poisson(lambs[p], 10))
        # Record the final position of each player in the simulated game
        result = sorted(turns, key=lambda x: turns[x])
        s_rank = {name: place for place, name in enumerate(result)}
        return [s_rank, lambs]

    games = [single_simulation(ps) for _ in range(n)]

    # Pull out the lambdas that match the games that occurred
    matching_results = [x[1] for x in games if x[0] == ranks]
    matching_n = len(matching_results)
    print(matching_n, 'matching games, or',
          round(matching_n / n, 3) * 100, 'percent')

    if matching_n == 0:
        # Without a single match the hit rate cannot be estimated (the
        # extrapolation below would divide by zero) and there is nothing to
        # build a density from.
        print('No simulated game matched the observed result; '
              'priors left unchanged')
        return

    # if matching_results is less than 1000, then run the expected number of
    # iterations to get up to 1500 matching game results.
    if matching_n < 1000:
        addl_games = round((n / matching_n) * (1500 - matching_n))
        print('Too few matching results, running', addl_games,
              'more simulations')
        # Run additional simulations
        games.extend(single_simulation(ps) for _ in range(addl_games))
        matching_results = [x[1] for x in games if x[0] == ranks]
        print('Now', len(matching_results), 'matching games')

    # Trying to build a density off of more than 1000 points is
    # computationally expensive and pretty pointless.  Sample out at most
    # 1000 matching games; the extra simulations only reach 1500 matches in
    # expectation, so fewer than 1000 may be available.
    sample_size = min(1000, len(matching_results))
    matching_results = random.sample(matching_results, sample_size)

    # Calculate density approximation for everyone
    for p in ps:
        # Pull out their matching turns to build matching distribution
        md = np.array([x[p] for x in matching_results]).reshape(-1, 1)
        # Determine the best bandwidth by 5-fold cross-validation over ten
        # candidates between 1.06*std/20 and 1.06*std.
        upper = 1.06 * md.std()
        lower = 1.06 * md.std() / 20
        rng = np.arange(lower, upper, (upper - lower) / 10)
        bws = {}
        for bw in rng:
            kde = KernelDensity(bandwidth=bw)
            bws[bw] = cross_val_score(kde, md, cv=5).mean()
        fbw = max(bws, key=bws.get)
        players[p].append(KernelDensity(bandwidth=fbw).fit(md))
# keep the formatting consistent, it would be nice to have the type of object # be the same for the first prior as all the other priors. As a result, we'll # build a density approximation of a normal distribution and use that. # Build density approximation using scikit learn: # http://scikit-learn.org/stable/modules/cross_validation.html # Use CV to get best bandwidth. Use kde.score() as the evualation metric. X = np.array(np.random.normal(15, 3, 1000)).reshape(-1, 1) upper = 1.06 * X.std() lower = 1.06 * X.std() / 20 rng = np.arange(lower, upper, (upper - lower) / 20) bws = {} for bw in rng: kde = KernelDensity(bandwidth=bw) s = cross_val_score(kde, X, cv=5).mean() bws[bw] = s fbw = max(bws.keys(), key=lambda x: bws[x]) kde = KernelDensity(bandwidth=fbw).fit(X) # Define empty players dictionary players = {} # Define function that represents a single simulation def single_simulation(ps): lambs = {} turns = {}
def cross_validate(test_data, bandwidths, n_folds=5):
    """Grid-search the KernelDensity bandwidth with shuffled K-fold CV.

    :param test_data: samples to cross-validate on.
    :param bandwidths: candidate bandwidth values.
    :param n_folds: number of folds.
    :return: ``(best_bandwidth, fitted_grid_search)``.
    """
    # Fixed random_state so the fold assignment is reproducible.
    folds = KFold(n=len(test_data), n_folds=n_folds, shuffle=True,
                  random_state=0)
    search = GridSearchCV(KernelDensity(), {'bandwidth': bandwidths},
                          cv=folds)
    search.fit(test_data)
    best_model = search.best_estimator_
    return best_model.bandwidth, search
def kde_histogram(x,x_range=None,bandwidth=None,fill=False,fill_properties=None,
                  line_properties=None,n_folds=3,printout=False,N_max=1000,zorder=0):
    '''
    --- A 1D method for plotting a kernel density estimate (rather than a
    histogram, for example) ---

    Inputs:
    -------
    x: x data

    x_range: range of the data. If None, then all of the *finite* data is
    used and the range is the min/max of that finite data.

    bandwidth: bandwidth of the Gaussian kernel. If None, it is chosen by
    cross-validation over 100 log-spaced values between 0.01 and 1 times the
    standard deviation of the selected data.

    fill: if True, the histogram will have a fill colour.

    fill_properties: _dictionary_ of terms for the histogram fill. Can take
    the keys 'color' and 'alpha'. Default is 'k' and 0.5.

    line_properties: _dictionary_ of terms for the line properties. Can have
    the keys 'color', 'alpha', 'linewidth', and 'linestyle' (defaults: 'k',
    1, 1, 'solid').

    n_folds: number of folds for the cross validation if no bandwidth is
    provided.

    printout: if True, then the optimised bandwidth will be printed.

    N_max: maximum number of points to do the cross-validation on. If more
    data points are provided, a random selection will be used.

    zorder: where to 'overlay' the plot.

    Outputs:
    --------
    x_range: range of the data.

    bandwidth: bandwidth of the KDE.

    Note: the global numpy random seed is reset to 0 on every call so that
    the cross-validation subsample is reproducible.
    '''
    # set the line + fill properties here:
    ####################################
    fp = {'color':'k', 'alpha':0.5}
    lp = {'color':'k', 'alpha':1, 'linewidth':1, 'linestyle':'solid'}
    if line_properties is not None:
        lp.update(line_properties)
    if fill_properties is not None:
        fp.update(fill_properties)
    ####################################
    np.random.seed(0)
    x = np.asarray(x)

    # keep only the finite, 'good' data, or the data that is
    # within the range of x specified:
    if x_range is None:
        select_x = np.isfinite(x)
        # Take the range from the finite values only; min/max over the raw
        # data would turn into nan/inf as soon as one bad value is present.
        finite_x = x[select_x]
        x_range = [np.min(finite_x),np.max(finite_x)]
    else:
        select_x = (x >= x_range[0]) & (x < x_range[1])

    x = x[select_x][:,np.newaxis]
    x_std = np.std(x) # for scaling the cross-validation inputs

    # Cross-validate on at most N_max points to keep the search cheap.
    if len(x) > N_max:
        x_test = np.random.choice(x.squeeze(),size=N_max,replace=False)
        x_test = x_test[:,np.newaxis]
    else:
        x_test = x.copy()

    if bandwidth is None:
        N_steps = 100
        bandwidths = np.logspace(-2,0,N_steps)*x_std
        bandwidth, grid = cross_validate(x_test,bandwidths,n_folds)
        if printout:
            print('Optimal bandwidth found: {0:.3f}'.format(bandwidth))

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(x)
    # Evaluate one standard deviation beyond the range on both sides so the
    # tails of the curve are visible.
    plot_x = np.linspace(x_range[0]-x_std,x_range[1]+x_std,100)[:,np.newaxis]
    # score_samples returns log-density.
    plot_y = np.exp(kde.score_samples(plot_x))
    plot_x,plot_y = [plot_x.squeeze(),plot_y.squeeze()]

    if fill:
        _ = plt.fill_between(plot_x,0,plot_y,color=fp['color'],alpha=fp['alpha'],zorder=zorder)
    _ = plt.plot(plot_x,plot_y,color=lp['color'],alpha=lp['alpha']
                 ,lw=lp['linewidth'],linestyle=lp['linestyle'],zorder=zorder)
    return x_range,bandwidth
class DensityEstimator:
    """Uniform wrapper around several density models.

    Supported ``method_name`` values: ``"GMM_Dirichlet"``, ``"GMM"``,
    ``"GMM_1"``, ``"GMM_10"``, ``"GMM_20"``, ``"GMM_100"``, ``"GMM_200"``,
    any name containing ``"aux_vae"``, ``"given_zs"`` (pre-computed samples
    loaded from ``log_dir``) and ``"KDE"`` (case-insensitive).
    """

    def __init__(self,
                 training_set,
                 method_name,
                 n_components=None,
                 log_dir=None,
                 second_stage_beta=None):
        """Create the underlying model; nothing is fitted yet.

        :param training_set: data handed to ``model.fit`` by ``fitorload``.
        :param method_name: which density model to build (see class doc).
        :param n_components: mixture size for ``"GMM"`` / ``"GMM_Dirichlet"``.
        :param log_dir: directory for logs, pickled models and, for
            ``"given_zs"``, the ``.npy`` file with the samples.
        :param second_stage_beta: forwarded to ``VaeModelWrapper``.
        :raises NotImplementedError: for an unknown ``method_name``.
        :raises FileNotFoundError: for ``"given_zs"`` when ``log_dir``
            contains no ``.npy`` file.
        """
        self.log_dir = log_dir
        self.training_set = training_set
        self.fitting_done = False
        self.method_name = method_name
        self.second_density_mdl = None
        self.skip_fitting_and_sampling = False

        # Method names that hard-code the number of mixture components.
        fixed_gmm_sizes = {
            "GMM_1": 1,
            "GMM_10": 10,
            "GMM_20": 20,
            "GMM_100": 100,
            "GMM_200": 200,
        }

        if method_name == "GMM_Dirichlet":
            self.model = mixture.BayesianGaussianMixture(
                n_components=n_components,
                covariance_type='full',
                weight_concentration_prior=1.0 / n_components)
        elif method_name == "GMM" or method_name in fixed_gmm_sizes:
            self.model = mixture.GaussianMixture(
                n_components=fixed_gmm_sizes.get(method_name, n_components),
                covariance_type='full',
                max_iter=2000,
                verbose=2,
                tol=1e-3)
        elif method_name.find("aux_vae") >= 0:
            # Anything after the first 8 characters of the name selects a
            # second-stage density model.
            have_2nd_density_est = False
            if method_name[8:] != "":
                self.second_density_mdl = method_name[8:]
                have_2nd_density_est = True
            self.model = VaeModelWrapper(
                input_shape=(training_set.shape[-1], ),
                latent_space_dim=training_set.shape[-1],
                have_2nd_density_est=have_2nd_density_est,
                log_dir=self.log_dir,
                sec_stg_beta=second_stage_beta)
        elif method_name == "given_zs":
            # Use the first .npy file in log_dir.  Fail clearly when there is
            # none instead of trying to load whatever file came last.
            z_smpls = next(
                (name for name in os.listdir(log_dir)
                 if name.endswith('.npy')), None)
            if z_smpls is None:
                raise FileNotFoundError(
                    "No .npy file with samples found in " + str(log_dir))
            self.z_smps = np.load(os.path.join(log_dir, z_smpls))
            self.skip_fitting_and_sampling = True
        elif method_name.upper() == "KDE":
            self.model = KernelDensity(kernel='gaussian', bandwidth=0.425)
            # self.model = KernelDensity(kernel='tophat', bandwidth=15)
        else:
            raise NotImplementedError("Method specified : " +
                                      str(method_name) +
                                      " doesn't have an implementation yet.")

    def fitorload(self, file_name=None):
        """Fit the model on the training set, or load it from ``file_name``.

        Does nothing for ``"given_zs"``.
        """
        if not self.skip_fitting_and_sampling:
            if file_name is None:
                self.model.fit(self.training_set, self.second_density_mdl)
            else:
                self.model.load(file_name)
            self.fitting_done = True

    def score(self, X, y=None):
        """Return ``model.score(X, y)``.

        :raises NotImplementedError: for VAE-based and ``"given_zs"``
            estimators.
        """
        if self.method_name.upper().find(
                "AUX_VAE") >= 0 or self.skip_fitting_and_sampling:
            raise NotImplementedError(
                "Log likelihood evaluation for VAE is difficult. or skipped")
        else:
            return self.model.score(X, y)

    def save(self, file_name):
        """Persist the model: ``model.save`` for VAEs, pickle otherwise."""
        if not self.skip_fitting_and_sampling:
            if self.method_name.find('vae') >= 0:
                self.model.save(file_name)
            else:
                with open(file_name, 'wb') as f:
                    pickle.dump(self.model, f)

    def reconstruct(self, input_batch):
        """Return ``model.reconstruct(input_batch)`` (VAE estimators only).

        :raises ValueError: for non-autoencoder estimators.
        """
        if self.method_name.upper().find("AUX_VAE") < 0:
            raise ValueError("Non autoencoder style density estimator: " +
                             self.method_name)
        return self.model.reconstruct(input_batch)

    def get_samples(self, n_samples):
        """Return ``n_samples`` samples in random order.

        The model is fitted first if necessary and, when ``log_dir`` is set,
        pickled to ``<log_dir>/<method_name>_mdl.pkl``.  For ``"given_zs"``
        the pre-loaded array is returned unchanged.
        """
        if self.skip_fitting_and_sampling:
            return self.z_smps

        if not self.fitting_done:
            self.fitorload()

        # Random row order applied to whatever the model returns.
        scrmb_idx = np.arange(n_samples)
        np.random.shuffle(scrmb_idx)

        if self.log_dir is not None:
            pickle_path = os.path.join(self.log_dir,
                                       self.method_name + '_mdl.pkl')
            with open(pickle_path, 'wb') as f:
                pickle.dump(self.model, f)

        name = self.method_name.upper()
        returns_tuple = name in {
            "GMM_DIRICHLET", "GMM", "GMM_1", "GMM_10", "GMM_20", "GMM_100",
            "GMM_200"
        } or name.find("AUX_VAE") >= 0
        if returns_tuple:
            # These models return a tuple whose first element holds the
            # samples.
            return self.model.sample(n_samples)[0][scrmb_idx, :]
        # Remaining models (KDE) return the sample array directly.  It is
        # indexed as is: np.random.shuffle works in place and returns None,
        # so its result must not be indexed.
        return self.model.sample(n_samples)[scrmb_idx, :]
def mean_log_likelihood(x_test_input):
    """Return the mean log-density of ``x_test_input`` under a Gaussian KDE
    (bandwidth 0.2) fitted on that same data.

    Previously the fitted model was discarded and ``None`` was returned.

    :param x_test_input: samples, one row per observation.
    :return: average of the per-sample log-densities.
    """
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(x_test_input)
    # score_samples returns one log-density per row.
    return kde.score_samples(x_test_input).mean()
def __init__(self,
             training_set,
             method_name,
             n_components=None,
             log_dir=None,
             second_stage_beta=None):
    """Create the underlying density model; nothing is fitted yet.

    :param training_set: data later used for fitting.
    :param method_name: ``"GMM_Dirichlet"``, ``"GMM"``, ``"GMM_1"``,
        ``"GMM_10"``, ``"GMM_20"``, ``"GMM_100"``, ``"GMM_200"``, a name
        containing ``"aux_vae"``, ``"given_zs"`` or ``"KDE"``
        (case-insensitive).
    :param n_components: mixture size for ``"GMM"`` / ``"GMM_Dirichlet"``.
    :param log_dir: directory for logs and, for ``"given_zs"``, the ``.npy``
        file holding pre-computed samples.
    :param second_stage_beta: forwarded to ``VaeModelWrapper``.
    :raises NotImplementedError: for an unknown ``method_name``.
    :raises FileNotFoundError: for ``"given_zs"`` when ``log_dir`` contains
        no ``.npy`` file.
    """
    self.log_dir = log_dir
    self.training_set = training_set
    self.fitting_done = False
    self.method_name = method_name
    self.second_density_mdl = None
    self.skip_fitting_and_sampling = False

    # Method names that hard-code the number of mixture components.
    fixed_gmm_sizes = {
        "GMM_1": 1,
        "GMM_10": 10,
        "GMM_20": 20,
        "GMM_100": 100,
        "GMM_200": 200,
    }

    if method_name == "GMM_Dirichlet":
        self.model = mixture.BayesianGaussianMixture(
            n_components=n_components,
            covariance_type='full',
            weight_concentration_prior=1.0 / n_components)
    elif method_name == "GMM" or method_name in fixed_gmm_sizes:
        self.model = mixture.GaussianMixture(
            n_components=fixed_gmm_sizes.get(method_name, n_components),
            covariance_type='full',
            max_iter=2000,
            verbose=2,
            tol=1e-3)
    elif method_name.find("aux_vae") >= 0:
        # Anything after the first 8 characters of the name selects a
        # second-stage density model.
        have_2nd_density_est = False
        if method_name[8:] != "":
            self.second_density_mdl = method_name[8:]
            have_2nd_density_est = True
        self.model = VaeModelWrapper(
            input_shape=(training_set.shape[-1], ),
            latent_space_dim=training_set.shape[-1],
            have_2nd_density_est=have_2nd_density_est,
            log_dir=self.log_dir,
            sec_stg_beta=second_stage_beta)
    elif method_name == "given_zs":
        # Use the first .npy file in log_dir.  Fail clearly when there is
        # none instead of trying to load whatever file came last.
        z_smpls = next(
            (name for name in os.listdir(log_dir) if name.endswith('.npy')),
            None)
        if z_smpls is None:
            raise FileNotFoundError("No .npy file with samples found in " +
                                    str(log_dir))
        self.z_smps = np.load(os.path.join(log_dir, z_smpls))
        self.skip_fitting_and_sampling = True
    elif method_name.upper() == "KDE":
        self.model = KernelDensity(kernel='gaussian', bandwidth=0.425)
        # self.model = KernelDensity(kernel='tophat', bandwidth=15)
    else:
        raise NotImplementedError("Method specified : " + str(method_name) +
                                  " doesn't have an implementation yet.")
def variable_score(variable, parents, data):
    """Score ``variable`` given ``parents`` with Gaussian kernel densities.

    With parents, the result is ``score(variable and parents) -
    score(parents)``, where each score is the log-likelihood of the observed
    rows under a KDE fitted on 5000 points resampled from a KDE of those
    rows.

    Without parents, the log-density of the column is plotted and its total
    log-likelihood printed, but the value returned is 0.
    NOTE(review): returning 0 for a root variable looks unintended --
    confirm whether ``kdens.score(vals)`` should be returned instead.

    :param variable: column name of the scored variable.
    :param parents: list of parent column names (may be empty).
    :param data: table indexed by column name (``data[cols].values`` is
        used).
    :return: the score described above.
    """
    def resampled_score(values):
        # Fit on the data, draw 5000 synthetic points, refit on those and
        # evaluate the original data under the second model.
        drawn = KernelDensity(kernel='gaussian', bandwidth=0.2,
                              rtol=1E-8).fit(values).sample(5000)
        refit = KernelDensity(kernel='gaussian', bandwidth=0.2,
                              rtol=1E-8).fit(drawn)
        return refit.score(values)

    if len(parents) != 0:
        joint_columns = parents + [variable]
        # Joint first, parents second: the order fixes the sequence of
        # random draws.
        score1 = resampled_score(data[joint_columns].values)
        score2 = resampled_score(data[parents].values)
        print(variable, parents, score1, score2, score1 - score2)
        return score1 - score2

    score = 0
    vals = data[variable].values[:, np.newaxis]
    start = time.time()
    kdens = KernelDensity(kernel='gaussian', bandwidth=0.2,
                          rtol=1E-2).fit(vals)
    # Log-density over the observations in descending order.
    descending = sorted(vals, reverse=True)
    plt.plot(descending, kdens.score_samples(descending))
    plt.show()
    print(kdens.score(vals))
    print("sklearn: ", time.time() - start)
    return score
Xd1 = np.array(X_std) pcd = pca_data1.fit(Xd1).transform(X_std1) print(pcd.shape) # In[31]: tuned_parameters = [{ 'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 500, 1000] }, { 'kernel': ['linear'], 'C': [1, 10, 100, 500, 1000] }] bds = 10 * np.linspace(-1, 1, 20) clf = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bds}, cv=5) clf.fit(pcd, y1) bdwidth = clf.best_params_['bandwidth'] print("Bandwidth : ", bdwidth) kde = KernelDensity(kernel='gaussian', bandwidth=bdwidth) kde.fit(pcd) print(kde) # ### GMM # In[33]: n_comps = np.arange(1, 21) clf_gauss_models = [ GaussianMixture(n_components=n, covariance_type='full').fit(pcd)
def get_flashes(self, prob_thresh=0.2):
    """Group the events of the TGF cluster into flashes by time separation.

    Rows of ``self.df`` belonging to ``self.TGF_cluster[0]`` with
    ``prob > prob_thresh`` are selected.  When more than one row remains and
    both ``self.pre_flash`` and ``self.post_flash`` are set, a Gaussian KDE
    (bandwidth 0.25) of ``time_sep`` is evaluated on [-1000, 1000]; its local
    minima are used as cut points and every event gets the index of the
    interval it falls in as its ``flash`` number.  Otherwise every selected
    row gets ``flash = 0``.

    Sets ``self.TGF_df`` and, on the KDE path, ``self.TGF_flash``,
    ``self.TGF_flash_full``, ``self.pre_flash``, ``self.post_flash``,
    ``self.dts``, ``self.dt_pre`` and ``self.dt_post``.

    :param prob_thresh: minimum ``prob`` for a row to be kept.
    """
    in_cluster = self.df[self.df['cluster'] == self.TGF_cluster[0]]
    # Work on a copy so that adding the 'flash' column below does not write
    # into a slice of self.df.
    TGF_df = in_cluster[in_cluster['prob'] > prob_thresh].copy()

    if len(TGF_df) > 1 and self.pre_flash is not None \
            and self.post_flash is not None:
        # I want to group things by times
        # Public import path; ``sklearn.neighbors.kde`` no longer exists in
        # current scikit-learn releases.
        from sklearn.neighbors import KernelDensity
        from scipy.signal import argrelextrema

        kde = KernelDensity(kernel='gaussian', bandwidth=0.25).fit(
            np.asarray(TGF_df['time_sep']).reshape(-1, 1))
        s = np.linspace(-1000, 1000, 5000)
        e = kde.score_samples(s.reshape(-1, 1))

        # Local minima of the log-density separate the flashes; the grid
        # ends are added as outer boundaries.
        mi = argrelextrema(e, np.less)[0]
        cuts = s[mi]
        cuts = np.insert(cuts, 0, -1000.0)
        cuts = np.append(cuts, 1000.0)

        # One label per event, emitted interval by interval.
        # NOTE(review): this assumes the rows are ordered by time_sep and
        # all lie within [-1000, 1000]; otherwise the labels do not line up
        # with the rows -- confirm against the caller.
        flash_list = []
        for indx in range(len(cuts) - 1):
            group = TGF_df[TGF_df['time_sep'].between(cuts[indx],
                                                      cuts[indx + 1])]
            flash_list.extend([indx] * len(group))
        TGF_df['flash'] = flash_list

        # Row(s) of the TGF itself: same time (to 6 decimals) and a known
        # latitude.
        is_tgf = (TGF_df['time_sep'].round(6) == round(self.TGF_time[0], 6)) \
            & TGF_df['lat'].round(6).isin(self.TGF_lat)
        location = TGF_df.loc[is_tgf].index.values
        # ``.ix`` was removed from pandas; these are index labels, so ``.loc``
        # is the equivalent accessor.
        self.TGF_flash = TGF_df.loc[location]['flash'].values[0]
        self.TGF_flash_full = TGF_df[TGF_df['flash'] == self.TGF_flash]
        self.TGF_df = TGF_df
        self.pre_flash = self.TGF_df[self.TGF_df['flash'] ==
                                     self.TGF_flash - 1]
        self.post_flash = self.TGF_df[self.TGF_df['flash'] ==
                                      self.TGF_flash + 1]

        if len(self.post_flash['flash']) == 0:
            # No flash after the TGF flash: reset all derived quantities.
            self.pre_flash, self.post_flash = (None, None)
            self.TGF_flash = 0
            self.dts = None
            self.dt_pre = None
            self.dt_post = None
            self.TGF_flash_full = None
        else:
            # Gap between the start of each flash and the end of the
            # previous one, in ascending flash order.
            dts = []
            for flash in sorted(set(TGF_df['flash'])):
                if flash > 0:
                    this_flash_chunk = TGF_df[TGF_df['flash'] == flash]
                    this_flash_start = this_flash_chunk.iloc[0]['time_sep']
                    prev_flash_chunk = TGF_df[TGF_df['flash'] == flash - 1]
                    prev_flash_end = prev_flash_chunk.iloc[-1]['time_sep']
                    dts.append(this_flash_start - prev_flash_end)
            self.dts = np.asarray(dts)
            self.dt_pre = self.TGF_flash_full['time_sep'].values[
                0] - self.pre_flash['time_sep'].values[-1]
            self.dt_post = self.post_flash['time_sep'].values[
                0] - self.TGF_flash_full['time_sep'].values[-1]
    else:
        TGF_df['flash'] = 0
        self.TGF_df = TGF_df
def Kernel_density_estimate(data, var_name1, var_name2, time, z):
    """Plot Gaussian-KDE log-density contours of ``data`` for six bandwidths.

    The first two columns of ``data`` are copied into a work array (the
    second column multiplied by ``amp = 100``), a Gaussian ``KernelDensity``
    is fitted for each bandwidth in turn, and the log-density on a 100 x 100
    grid is drawn as contours over a scatter of the samples in a 3 x 2 panel
    figure.  The figure is saved under
    ``<fullpath_out>/CloudClosure_figures/`` and closed.

    The bandwidth acts as a smoothing parameter: a large bandwidth gives a
    very smooth (high-bias) density, a small one an unsmooth (high-variance)
    density.

    Relies on names defined elsewhere in the module: ``nx``, ``ny``, ``nvar``,
    ``fullpath_out``, ``labeling``, ``plt``, ``os`` and ``np``.

    Args:
        data: 2-D array; columns 0 and 1 are used.
        var_name1: name of the first variable (axis labels and file name).
        var_name2: name of the second variable (axis labels and file name).
        time: time stamp, used in the output file name.
        z: height, converted with ``int`` for the output file name.

    Returns:
        A pair ``(kde, kde)`` holding the estimator fitted with the last
        (smallest) bandwidth twice.
    """
    # Public import path; the private ``sklearn.neighbors.kde`` module has
    # been removed from scikit-learn.
    from sklearn.neighbors import KernelDensity

    amp = 100
    bandwidths = (5e-2, 1e-2, 8e-3, 5e-3, 2e-3, 1e-3)

    # NOTE(review): np.ndarray leaves memory uninitialised; only columns 0 and
    # 1 are filled here, so this presumably expects nvar == 2 - confirm.
    data_aux = np.ndarray(shape=((nx * ny), nvar))
    data_aux[:, 0] = data[:, 0]
    data_aux[:, 1] = data[:, 1] * amp

    # construct a kernel density estimate of the distribution
    print(" - computing KDE in spherical coordinates")

    # Evaluation grids: one in the original units, one in the scaled units.
    n_sample = 100
    x_ = np.linspace(np.amin(data[:, 0]), np.amax(data[:, 0]), n_sample)
    y_ = np.linspace(np.amin(data[:, 1]), np.amax(data[:, 1]), n_sample)
    X, Y = np.meshgrid(x_, y_)
    XX = np.array([X.ravel(), Y.ravel()]).T
    x_aux = np.linspace(np.amin(data_aux[:, 0]), np.amax(data_aux[:, 0]),
                        n_sample)
    y_aux = np.linspace(np.amin(data_aux[:, 1]), np.amax(data_aux[:, 1]),
                        n_sample)
    X_aux, Y_aux = np.meshgrid(x_aux, y_aux)
    XX_aux = np.array([X_aux.ravel(), Y_aux.ravel()]).T

    fig = plt.figure(figsize=(12, 16))
    # One panel per bandwidth; previously six copy-pasted stanzas.
    for panel, bw in enumerate(bandwidths, start=1):
        plt.subplot(3, 2, panel)
        kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(data_aux)
        Z_log = kde.score_samples(XX_aux).reshape(X.shape)
        plt.scatter(data_aux[:, 0], data_aux[:, 1], s=5, alpha=0.2)
        ax1 = plt.contour(X_aux, Y_aux, Z_log)
        plt.colorbar(ax1, shrink=0.8)
        labeling(var_name1, var_name2, amp)
        plt.title('bw = ' + str(bw))

    fig.suptitle('Cloud Closure: Kernel Density Estimate (gaussian)',
                 fontsize=20)
    # ``np.int`` was removed from NumPy; it was an alias of the builtin int.
    plt.savefig(
        os.path.join(
            fullpath_out, 'CloudClosure_figures',
            'CC_' + var_name1 + '_' + var_name2 + '_' + str(time) + '_z' +
            str(int(z)) + 'm_KDE.png'))
    plt.close()

    print('KDE shapes: ', kde.score_samples(XX).shape, X.shape)
    print(kde.get_params())
    return kde, kde
# Public import path; the private ``sklearn.neighbors.kde`` module has been
# removed from scikit-learn.  Imported once instead of on every iteration.
from sklearn.neighbors import KernelDensity
import numpy as np

compilers = [
    "Intel_data.txt", "GCC_data.txt", "Clang_data.txt", "Intel_avg_data.txt",
    "GCC_avg_data.txt", "Clang_avg_data.txt", "MKL_data.txt"
]

for currentCompiler in compilers:
    data = np.genfromtxt(currentCompiler, skip_header=0)

    # Normalised 30-bin histogram of the raw samples on a log y axis.
    fig, ax = plt.subplots()
    ax.set_yscale('log')
    # ``normed`` was removed from Matplotlib; ``density=True`` replaces it.
    ax.hist(data, 30, density=True, facecolor='green', alpha=0.75)
    #ax.axis([0, 25000, 0, 0.00015])

    # Gaussian KDE with a bandwidth of 1/30 of the data range, evaluated on a
    # grid that extends 20 % of the range beyond both ends.
    m1 = np.min(data)
    m2 = np.max(data)
    dm = m2 - m1
    kde0 = KernelDensity(kernel='gaussian',
                         bandwidth=dm / 30).fit(data.reshape(-1, 1))
    X_plot = np.linspace(m1 - 0.2 * dm, m2 + 0.2 * dm, 1000).reshape(-1, 1)
    # score_samples returns the log of the density
    Dens0 = np.exp(kde0.score_samples(X_plot))

    fig, ax = plt.subplots()
    ax.plot(X_plot, Dens0, color='blue')
    ax.set_yscale('log')
    ax.set_ylim(0.01, np.max(Dens0) * 1.1)
    #plt.show()
    save(currentCompiler, fmt='pdf')
    save(currentCompiler, fmt='png')
import sys

from IPython.display import Image
# Public import path; the private ``sklearn.neighbors.kde`` module has been
# removed from scikit-learn.
from sklearn.neighbors import KernelDensity

# NOTE(security): pickle.load can execute arbitrary code - only load a
# ``crater_tuto`` file that comes from a trusted source.
# Pickles are binary, so the file is opened in 'rb' mode, and the context
# manager closes it even if loading fails.
with open("crater_tuto", "rb") as f:
    if sys.version_info[0] >= 3:
        # Python 3 needs an explicit encoding to read a Python 2 pickle.
        crater = pickle.load(f, encoding='latin1')
    else:
        crater = pickle.load(f)

plt.scatter(crater[:, 0], crater[:, 1], s=0.1)
plt.show()

# Create the grid for a 10 by 10 cubical complex (step 0.05).
xval = np.arange(0, 10, 0.05)
yval = np.arange(0, 10, 0.05)
nx = len(xval)
ny = len(yval)

# Now we compute the values of the kernel density estimator at each point of
# our grid. The values will be stored in the array scores.
kde = KernelDensity(kernel='gaussian', bandwidth=0.3).fit(crater)
positions = np.array([[u, v] for u in xval for v in yval])
# Negated density: score_samples returns the log-density.
scores = -np.exp(kde.score_samples(X=positions))

# And subsequently construct a cubical complex based on the scores.
cc_density_crater = gd.CubicalComplex(dimensions=[nx, ny],
                                      top_dimensional_cells=scores)

# OPTIONAL
pers_density_crater = cc_density_crater.persistence()
# Do not assign the result back to ``plt``: that replaced the pyplot module
# with the return value of ``.show()`` for the rest of the script.
gd.plot_persistence_diagram(pers_density_crater).show()
msa_vectors.append(np.ndarray.flatten(tools.convert_samp_to_one_hot(msa[samp], n_aa))) msa_vectors = np.array(msa_vectors) print msa_vectors.shape #PCA pca = PCA(n_components=20) pca.fit(msa_vectors[1000:]) a_samps_pca = pca.transform(msa_vectors[1000:]) b_samps_pca = pca.transform(msa_vectors[:1000]) print a_samps_pca.shape #KDE # for bw in [.01, .1, 1., 10.]: for bw in [ 1.]: kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(a_samps_pca) # density_train = kde.score_samples(msa_vectors) print bw, kde.score(b_samps_pca) densities = kde.score_samples(b_samps_pca) # densities = np.ones(1000) #Scale densities to betw 0 and 1 min_density = np.min(densities) densities = densities - min_density + 1. weights = np.reciprocal(densities) max_weights = np.max(weights) weights = weights / max_weights
def get_groups(data, plot=True):
    """Split one-dimensional data into groups at the minima of its Gaussian KDE.

    A Gaussian kernel density estimate of ``data`` is evaluated on 100 points
    between 0 and ``max(data)``.  Every local minimum of that density is a
    cut: values below the first cut form the first group, values from one cut
    (inclusive) up to the next (exclusive) form the middle groups, and values
    at or above the last cut form the last group.  With no minimum, all data
    is one group.

    Args:
        data: array of samples, either shape ``(n,)`` or ``(n, 1)``.
        plot: when true, briefly show the density, its extrema and the data.

    Returns:
        ``(groups_idxs, groups, groups_maxs)``: for each group the indices of
        its members in ``data``, the member values, and the result of
        ``get_most_prob_value`` on the density segment covering the group.
    """
    # Normal-reference rule of thumb: 1.06 * sigma * n^(-1/5).  The exponent
    # is written as a float so integer division can never turn it into -1.
    bandwidth = 1.06 * np.std(data) * (len(data)**(-1.0 / 5))
    s = np.linspace(0, np.max(data), 100)
    # scikit-learn requires a 2-D (n_samples, n_features) array; reshaping
    # leaves (n, 1) input unchanged and additionally accepts 1-D input.
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=max(bandwidth, 1e-10)).fit(
                            np.asarray(data).reshape(-1, 1))
    e = kde.score_samples(s.reshape(-1, 1))
    e = np.exp(e)
    mins, maxs = argrelextrema(e, np.less)[0], argrelextrema(e,
                                                             np.greater)[0]

    groups = []
    groups_idxs = []
    groups_maxs = []  # maximum likelihood point

    if len(mins) == 0:
        groups = [data]
        groups_idxs.append(np.arange(0, len(data), 1))
        groups_maxs.append(get_most_prob_value(s, e))
    else:
        cut_values = s[mins]
        n_segments = len(mins) + 1
        for k in range(n_segments):
            if k == 0:  # everything below the first cut
                member = data < cut_values[0]
                lo, hi = 0, mins[0] + 1
            elif k == n_segments - 1:  # everything from the last cut on
                member = data >= cut_values[-1]
                lo, hi = mins[-1], len(s)
            else:  # between two consecutive cuts
                member = (data >= cut_values[k - 1]) & (data < cut_values[k])
                lo, hi = mins[k - 1], mins[k] + 1
            groups.append(data[member])
            groups_idxs.append(np.where(member)[0])
            # Density segment including both bounding minima.
            groups_maxs.append(get_most_prob_value(s[lo:hi], e[lo:hi]))

    if plot:
        plt.plot(s, e)
        print(groups_maxs)
        print([len(g) for g in groups])
        print([g[-5:] for g in groups])
        plt.plot(s[maxs], e[maxs], 'go', s[mins], e[mins], 'ro')
        for i in range(len(mins)):
            if i == 0:  # first one
                plt.plot(s[:mins[i] + 1], e[:mins[i] + 1])
            elif i == len(mins) - 1:  # last one
                plt.plot(s[mins[i]:], e[mins[i]:])
            else:
                plt.plot(s[mins[i]:mins[i + 1] + 1],
                         e[mins[i]:mins[i + 1] + 1])
        for i, d in enumerate(data):
            plt.plot(d, 0.01, 'bo', markersize=10)
        plt.show(block=False)
        plt.pause(0.5)
        plt.close()

    # Sanity check: every stored index must point back at the stored value.
    for k, (idx_group, group) in enumerate(zip(groups_idxs, groups)):
        for idx, d in zip(idx_group, group):
            if data[idx] != d:
                print(data[idx])
                print(d)
                print('sorcery')
            assert data[idx] == d

    print('ALP space : bins in [0 , {}], bandwidth={}, clusters={}'.format(
        np.max(data), bandwidth, [len(c) for c in groups]))
    return groups_idxs, groups, groups_maxs
def run(imagepath): t0 = time() image = io.imread(imagepath, as_gray=True) pyramid = pyramid_gaussian.get_pyramid(image) cfg.num_of_patches = cfg.num_test_patches # changing the number of patches for img, sc_name in zip(pyramid, cfg.scale_names): if sc_name == '0_12': init_flag = True ss = [] ns = 0 patches, centres = sample_patches.create_patches_randomly( img, subshape=ss, initialization=init_flag) f = extract_features.extractFeaturesForPatches(patches) # 0: femur # 1: cadera # 2: superior # 3: inferior d_tilde, f_tilde, c_tilde = build_matrices(cfg.bone_structures[3], sc_name, n_subs=ns) l = d_tilde.shape[0] // 2 # number of landmarks # Obtener los puntos f_hat = np.concatenate((f_tilde, f), axis=1) c_bar = compute_C_matrix(centres, l) c = np.tile(centres, (l, 1)) d = compute_D_matrix(f_hat, d_tilde, c_bar, l) data = d + c kde_ = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(data.T) sc = kde_.score_samples(data.T) max = np.argmax(np.exp(sc)) shape = data[:, max] shape = np.reshape(shape, (l, 2)) a = shape[:, 0] b = shape[:, 1] a *= 8 b *= 8 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[5], b[5], 'b.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[12], b[12], 'b.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() a /= 4 b /= 4 else: for count in range(5): init_flag = False if count == 4: ss = shape[(4 * count):(4 * count + 5), :] else: ss = shape[(4 * count):(4 * count + 4), :] ns = count # if sc_name != '0_25': # ss = shape[0:4,:] patches, centres = sample_patches.create_patches_randomly( img, subshape=ss, initialization=init_flag) f = extract_features.extractFeaturesForPatches(patches) # 0: femur # 1: cadera # 2: superior # 3: inferior d_tilde, f_tilde, c_tilde = build_matrices( cfg.bone_structures[3], sc_name, n_subs=ns) l = d_tilde.shape[0] // 2 # number of landmarks # Obtener los puntos f_hat = np.concatenate((f_tilde, f), axis=1) c_bar = 
compute_C_matrix(centres, l) c = np.tile(centres, (l, 1)) d = compute_D_matrix(f_hat, d_tilde, c_bar, l) data = d + c kde_ = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(data.T) sc = kde_.score_samples(data.T) max = np.argmax(np.exp(sc)) shape1 = data[:, max] shape1 = np.reshape(shape1, (l, 2)) # a = shape1[:, 0] # b = shape1[:, 1] # if sc_name == '0_25': # a *= 4 # b *= 4 # elif sc_name == '0_5': # a *= 2 # b *= 2 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() # if sc_name == '0_25': # a /= 2 # b /= 2 if count == 4: shape[(4 * count):(4 * count + 5), :] = shape1[0:5, :] * 2 else: shape[(4 * count):(4 * count + 4), :] = shape1[0:4, :] * 2 a = shape[:, 0] b = shape[:, 1] if sc_name == '0_25': a = a * 2 b = b * 2 if sc_name == '1': a = a / 2 b = b / 2 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[5], b[5], 'b.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[12], b[12], 'b.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() izquierdaX = np.copy(a) izquierdaY = np.copy(b) for img, sc_name in zip(pyramid, cfg.scale_names): if sc_name == '0_12': init_flag = True ss = [] ns = 0 patches, centres = sample_patches.create_patches_randomly( img, subshape=ss, initialization=init_flag) f = extract_features.extractFeaturesForPatches(patches) # 0: femur # 1: cadera # 2: superior # 3: inferior d_tilde, f_tilde, c_tilde = build_matrices(cfg.bone_structures[1], sc_name, n_subs=ns) l = d_tilde.shape[0] // 2 # number of landmarks # Obtener los puntos f_hat = np.concatenate((f_tilde, f), axis=1) c_bar = compute_C_matrix(centres, l) c = np.tile(centres, (l, 1)) d = compute_D_matrix(f_hat, d_tilde, c_bar, l) data = d + c kde_ = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(data.T) sc = kde_.score_samples(data.T) max = np.argmax(np.exp(sc)) shape = data[:, max] shape = 
np.reshape(shape, (l, 2)) a = shape[:, 0] b = shape[:, 1] a *= 8 b *= 8 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[5], b[5], 'b.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() a /= 4 b /= 4 else: for count in range(5): init_flag = False if count == 4: ss = shape[(4 * count):(4 * count + 5), :] else: ss = shape[(4 * count):(4 * count + 4), :] ns = count # if sc_name != '0_25': # ss = shape[0:4,:] patches, centres = sample_patches.create_patches_randomly( img, subshape=ss, initialization=init_flag) f = extract_features.extractFeaturesForPatches(patches) # 0: femur # 1: cadera # 2: superior # 3: inferior d_tilde, f_tilde, c_tilde = build_matrices( cfg.bone_structures[1], sc_name, n_subs=ns) l = d_tilde.shape[0] // 2 # number of landmarks # Obtener los puntos f_hat = np.concatenate((f_tilde, f), axis=1) c_bar = compute_C_matrix(centres, l) c = np.tile(centres, (l, 1)) d = compute_D_matrix(f_hat, d_tilde, c_bar, l) data = d + c kde_ = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(data.T) sc = kde_.score_samples(data.T) max = np.argmax(np.exp(sc)) shape1 = data[:, max] shape1 = np.reshape(shape1, (l, 2)) # a = shape1[:, 0] # b = shape1[:, 1] # if sc_name == '0_25': # a *= 4 # b *= 4 # elif sc_name == '0_5': # a *= 2 # b *= 2 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() # if sc_name == '0_25': # a /= 2 # b /= 2 if count == 4: shape[(4 * count):(4 * count + 5), :] = shape1[0:5, :] * 2 else: shape[(4 * count):(4 * count + 4), :] = shape1[0:4, :] * 2 a = shape[:, 0] b = shape[:, 1] if sc_name == '0_25': a = a * 2 b = b * 2 if sc_name == '1': a = a / 2 b = b / 2 # fig, ax_ = plt.subplots() # ax_.imshow(image, cmap=plt.cm.gray) # ax_.plot(a, b, 'r.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[5], b[5], 'b.', markersize=8, mec='k', mew=0.3) # ax_.plot(a[12], 
b[12], 'b.', markersize=8, mec='k', mew=0.3) # ax_.axis('off') # plt.show() derechaX = np.copy(a) derechaY = np.copy(b) fig, ax_ = plt.subplots() ax_.imshow(image, cmap=plt.cm.gray) ax_.plot(a, b, 'r.', markersize=5, mec='k', mew=0.3) ax_.plot(izquierdaX, izquierdaY, 'r.', markersize=8, mec='k', mew=0.3) ax_.plot(derechaX, derechaY, 'r.', markersize=8, mec='k', mew=0.3) IT = (izquierdaY[12] - izquierdaY[5]) / (izquierdaX[12] - izquierdaX[5]) DT = (derechaY[12] - derechaY[5]) / (derechaX[12] - derechaX[5]) a1 = 2 * izquierdaX[5] - izquierdaX[12] a2 = 2 * izquierdaX[12] - izquierdaX[5] b1 = IT * (a1 - izquierdaX[5]) + izquierdaY[5] b2 = IT * (a2 - izquierdaX[5]) + izquierdaY[5] c1 = 2 * derechaX[5] - derechaX[12] c2 = 2 * derechaX[12] - derechaX[5] d1 = DT * (c1 - derechaX[5]) + derechaY[5] d2 = DT * (c2 - derechaX[5]) + derechaY[5] HT = (derechaY[12] - izquierdaY[12]) / (derechaX[12] - izquierdaX[12]) e1 = a1 e2 = c1 f1 = HT * (e1 - izquierdaX[12]) + izquierdaY[12] f2 = HT * (e2 - izquierdaX[12]) + izquierdaY[12] g1 = izquierdaX[5] g2 = HT * (g1 - izquierdaX[12]) + izquierdaY[12] h1 = derechaX[5] h2 = HT * (h1 - izquierdaX[12]) + izquierdaY[12] ax_.plot([e1, e2], [f1, f2], 'g', markersize=8, mec='k', mew=0.3) ax_.plot([c1, c2], [d1, d2], 'g', markersize=8, mec='k', mew=0.3) ax_.plot([a1, a2], [b1, b2], 'g', markersize=8, mec='k', mew=0.3) ax_.plot(izquierdaX[5], izquierdaY[5], 'b.', markersize=10, mec='k', mew=0.3) ax_.plot(izquierdaX[12], izquierdaY[12], 'b.', markersize=10, mec='k', mew=0.3) ax_.plot(derechaX[5], derechaY[5], 'b.', markersize=10, mec='k', mew=0.3) ax_.plot(derechaX[12], derechaY[12], 'b.', markersize=10, mec='k', mew=0.3) ax_.plot([g1, h1], [g2, h2], 'b.', markersize=10, mec='k', mew=0.3) nume1 = izquierdaY[5] * (izquierdaX[12] - g1) + izquierdaY[12] * ( g1 - izquierdaX[5]) + g2 * (izquierdaX[5] - izquierdaX[12]) deno1 = (izquierdaX[5] - izquierdaX[12]) * (izquierdaX[12] - g1) + ( izquierdaY[5] - izquierdaY[12]) * (izquierdaY[12] - g2) rati1 = 
nume1 / deno1 angl1 = math.atan(rati1) deg1 = (angl1 * 180) / math.pi if deg1 < 0: deg1 = deg1 + 180 print(deg1) nume2 = derechaY[5] * (derechaX[12] - h1) + derechaY[12] * ( h1 - derechaX[5]) + h2 * (derechaX[5] - derechaX[12]) deno2 = (derechaX[5] - derechaX[12]) * (derechaX[12] - h1) + ( derechaY[5] - derechaY[12]) * (derechaY[12] - h2) rati2 = nume2 / deno2 angl2 = math.atan(rati2) deg2 = (angl2 * 180) / math.pi if deg2 < 0: deg2 = deg2 + 180 deg2 = 180 - deg2 print(deg2) ax_.text(izquierdaX[12] - 20, izquierdaY[12] + 20, round(deg1, 2), color='yellow') ax_.text(derechaX[12] + 20, derechaY[12] + 20, round(deg2, 2), color='yellow') print('####\tNiña\tNiño') na = 'N' no = 'N' #1-2 if deg1 > 36 or deg2 > 36: na = 'L' if deg1 > 41.5 or deg2 > 41.5: na = 'G' if deg1 > 29 or deg2 > 31: no = 'L' if deg1 > 33 or deg2 > 35: no = 'G' print('1-2\t' + na + '\t' + no) #3-4 if deg1 > 31.5 or deg2 > 33: na = 'L' if deg1 > 36.5 or deg2 > 38.5: na = 'G' if deg1 > 28 or deg2 > 29: no = 'L' if deg1 > 32.5 or deg2 > 33.5: no = 'G' print('3-4\t' + na + '\t' + no) #5-6 if deg1 > 27.5 or deg2 > 29.5: na = 'L' if deg1 > 32 or deg2 > 34: na = 'G' if deg1 > 24.5 or deg2 > 27: no = 'L' if deg1 > 29 or deg2 > 31.5: no = 'G' print('5-6\t' + na + '\t' + no) #7-9 if deg1 > 25.5 or deg2 > 27: na = 'L' if deg1 > 29.5 or deg2 > 31.5: na = 'G' if deg1 > 24.5 or deg2 > 25.5: no = 'L' if deg1 > 29 or deg2 > 29.5: no = 'G' print('7-9\t' + na + '\t' + no) #2a-3a if deg1 > 22 or deg2 > 23.5: na = 'L' if deg1 > 25.5 or deg2 > 27: na = 'G' if deg1 > 21 or deg2 > 22.5: no = 'L' if deg1 > 25 or deg2 > 27: no = 'G' print('2a-3a\t' + na + '\t' + no) #3a-5a if deg1 > 18 or deg2 > 21: na = 'L' if deg1 > 25.5 or deg2 > 25.5: na = 'G' if deg1 > 19 or deg2 > 20: no = 'L' if deg1 > 23.5 or deg2 > 24: no = 'G' print('3a-5a\t' + na + '\t' + no) ax_.axis('off') plt.show() ''' l = d_tilde.shape[0] // 2 # number of landmarks # Composed matrix f_hat = np.concatenate((f_tilde, f), axis=1) c_bar = 
compute_C_matrix(centres, l) c = np.tile(centres, (l, 1)) d = compute_D_matrix(f_hat, d_tilde, c_bar, l) positions_ = d + c density_estimation(positions_, img,imagepath) ''' '''
def _evaluate_vec(self, opts, step, real_points, fake_points,
                  validation_fake_points, prefix=''):
    """Return the mean log-likelihood of the real data and the Coverage metric.

    A Gaussian KDE is fitted to the flattened ``fake_points``.  Coverage is
    one minus the fraction of real points whose log-density is at or below
    the 5th percentile of the log-densities of the fake points themselves,
    i.e. the mass of true data covered by the 95% quantile of the model
    density.

    The starting bandwidth is the median Euclidean distance between
    consecutive fake points.  When ``validation_fake_points`` is given, that
    value is scaled by powers of two (2**-7 .. 2**6) and the candidate with
    the highest mean validation log-likelihood is used instead.

    Returns:
        ``(log_p, C)``: mean log-density of ``real_points`` and the coverage.
    """
    # Median distance between neighbouring fake samples (summed over the
    # three trailing axes) as the initial bandwidth.
    consecutive_diff = fake_points[:-1] - fake_points[1:]
    squared = consecutive_diff * consecutive_diff
    pair_dist = np.sqrt(np.sum(squared, axis=(1, 2, 3)))
    bandwidth = np.median(pair_dist)

    num_real = len(real_points)
    num_fake = len(fake_points)
    fake_flat = np.reshape(fake_points, [num_fake, -1])

    if validation_fake_points is not None:
        num_val = len(validation_fake_points)
        val_flat = np.reshape(validation_fake_points, [num_val, -1])
        best_score = -1000000.
        candidates = bandwidth * (2. ** (np.arange(14) - 7.))
        for candidate in candidates:
            trial = KernelDensity(kernel='gaussian', bandwidth=candidate)
            trial.fit(fake_flat)
            trial_score = np.mean(trial.score_samples(val_flat))
            if trial_score > best_score:
                # Keep the bandwidth with the best validation likelihood.
                bandwidth, best_score = candidate, trial_score

    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(fake_flat)

    # Coverage: np.percentile(a, 5) returns t such that np.mean(a <= t) is
    # about 0.05.
    model_log_density = kde.score_samples(fake_flat)
    threshold = np.percentile(model_log_density, 5)
    real_flat = np.reshape(real_points, [num_real, -1])
    real_points_log_density = kde.score_samples(real_flat)

    log_p = np.mean(real_points_log_density)
    C = 1. - np.mean(real_points_log_density <= threshold)
    logging.info('Evaluating: log_p=%.3f, C=%.3f' % (log_p, C))
    return log_p, C
# Gather the per-pixel value vectors of img_input at the foreground and
# background pixel coordinates.
points_fg = np.array([img_input[x, y, :] for (x, y) in xy_fg])
points_bg = np.array([img_input[x, y, :] for (x, y) in xy_bg])

# Distribution of channels 0, 1, 2 (drawn in red, green, blue - presumably an
# RGB image; confirm): foreground on the top axes, background on the bottom.
fig, axes = plt.subplots(nrows=2, ncols=1)
sns.distplot(points_fg[:, 0], ax=axes[0], color='r')
sns.distplot(points_fg[:, 1], ax=axes[0], color='g')
sns.distplot(points_fg[:, 2], ax=axes[0], color='b')
sns.distplot(points_bg[:, 0], ax=axes[1], color='r')
sns.distplot(points_bg[:, 1], ax=axes[1], color='g')
sns.distplot(points_bg[:, 2], ax=axes[1], color='b')

# Mask computation - the slowest operation.
# One Gaussian KDE per class, fitted on that class's pixel vectors.
kde_fg = KernelDensity(kernel='gaussian',
                       bandwidth=1,
                       algorithm='kd_tree',
                       leaf_size=100).fit(points_fg)
kde_bg = KernelDensity(kernel='gaussian',
                       bandwidth=1,
                       algorithm='kd_tree',
                       leaf_size=100).fit(points_bg)

# Per-pixel result maps with the image's first two dimensions.
score_kde_fg = np.zeros(img_input.shape[:2])
score_kde_bg = np.zeros(img_input.shape[:2])
likelihood_fg = np.zeros(img_input.shape[:2])

# Visit every pixel; kde.score on a single sample is its log-density, so the
# exponential stores the foreground density of that pixel.
coodinates = it.product(range(score_kde_fg.shape[0]),
                        range(score_kde_fg.shape[1]))
for x, y in tqdm_notebook(coodinates, total=np.prod(score_kde_fg.shape)):
    score_kde_fg[x, y] = np.exp(kde_fg.score(img_input[x, y, :].reshape(1, -1)))
import numpy import pandas from sklearn.neighbors.kde import KernelDensity import matplotlib.pyplot as plt from scipy.stats import norm df = pandas.read_csv('C:/Udemy/SKLEARN-Python/004_visits_per_day.csv', index_col=False, header=0) X_plot = numpy.linspace(-2, 2, 1000)[:, numpy.newaxis] kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(df.values) log_dens = kde.score_samples(X_plot) #plt.hist(df.values, bins=numpy.linspace(0, 1, 20), fc='#AAAAFF', normed=True) plt.hist(df.values, bins=numpy.linspace(0, 1, 10), fc='#AAAAFF', normed=True) plt.plot(X_plot, numpy.exp(log_dens)) plt.show() ######################################################################################### df = pandas.read_csv('C:/Udemy/SKLEARN-Python/004_visits_per_day.csv', index_col=False, header=0) X_plot = numpy.linspace(-20, 1, 1000)[:, numpy.newaxis] kde = KernelDensity(kernel='gaussian', bandwidth=1.4).fit(numpy.log(df.values)) # here I needed to change the bandwidth value to 1.4 log_dens = kde.score_samples(X_plot) plt.hist(numpy.log(df.values),
for chunk_id in bar(range(0, chunk_nu)): col = chunks.get_chunk()[col_name] ys = responses[chunk_id * max_chunk_size:chunk_id * max_chunk_size + col.shape[0]] for i in range(0, col.shape[0], 1): value = col.iloc[i] y = ys[i] if value != value: cnts[y]['nan'] += 1 else: cnts[y]['nu'].append(value) cnts[0]['nu'] = np.asarray(cnts[0]['nu']).reshape(-1, 1) cnts[1]['nu'] = np.asarray(cnts[1]['nu']).reshape(-1, 1) print('cal kde for 0...') if cnts[0]['nu'].size > 0: cnts[0]['kde'] = KernelDensity(kernel='gaussian').fit( cnts[0]['nu']) print('cal kde for 1...') if cnts[1]['nu'].size > 0: cnts[1]['kde'] = KernelDensity(kernel='gaussian').fit( cnts[1]['nu']) utils.save_variable(cnts, file_path) break except ValueError: print('get ValueError. Restart again.') #%%
def plot_solid_liquid_ratio(temperature_next,
                            strain_lst,
                            nve_run_time_steps,
                            project_parameter,
                            debug_plot=True):
    """Estimate, per strain, the fraction of the cell along z that is crystalline.

    For every strain the matching NVE job is loaded and its atoms are
    classified by adaptive CNA.  If the expected structure
    (``project_parameter['crystalstructure']``, upper-cased) strictly
    outnumbers the other two of BCC/FCC/HCP, a Gaussian KDE of the z
    coordinates of those atoms is normalised to a maximum of one and the
    ratio is derived from the z extent where it lies above / below 0.1.
    Otherwise the ratio for that strain is 0.0.

    Args:
        temperature_next: temperature used to build the job name.
        strain_lst: strains to evaluate.
        nve_run_time_steps: step count used to build the job name.
        project_parameter: mapping providing ``'crystalstructure'``,
            ``'nve_run_time_steps_lst'`` and ``'project'``.
        debug_plot: when true, show a position scatter and the density curve
            for every strain.

    Returns:
        List with one ratio per strain.
    """
    cna_str = project_parameter['crystalstructure'].upper()
    ratio_lst = []
    for strain in strain_lst:
        job_name = get_nve_job_name(
            temperature_next=temperature_next,
            strain=strain,
            steps_lst=project_parameter['nve_run_time_steps_lst'],
            nve_run_time_steps=nve_run_time_steps)
        ham_nve = project_parameter['project'].load(job_name)
        struct = ham_nve.get_structure().center_coordinates_in_unit_cell()
        cna = struct.analyse_ovito_cna_adaptive(mode='str')

        # Atom count per recognised lattice type.
        counts = {label: sum(cna == label) for label in ('BCC', 'FCC', 'HCP')}
        expected_dominates = cna_str in counts and all(
            counts[cna_str] > count
            for label, count in counts.items() if label != cna_str)

        if expected_dominates:
            # Kernel width: cube root of the volume per atom.
            bandwidth = (struct.get_volume() / len(struct))**(1.0 / 3.0)
            crystal_z = struct.positions[:, 2][cna == cna_str]
            kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(
                crystal_z.reshape(-1, 1))
            z_range = np.linspace(struct.positions[:, 2].min(),
                                  struct.positions[:, 2].max(), 1000)
            density = np.exp(kde.score_samples(z_range.reshape(-1, 1)))
            gaussian_funct = density / density.max()

            z_range_above_limit = z_range[gaussian_funct > 0.1]
            z_range_below_limit = z_range[gaussian_funct < 0.1]
            z_span = np.max(z_range) - np.min(z_range)

            ratio_above = 1.0
            if len(z_range_above_limit) != 0:
                ratio_above = (np.max(z_range_above_limit) -
                               np.min(z_range_above_limit)) / z_span
            ratio_below = 0.0
            if len(z_range_below_limit) != 0:
                ratio_below = 1 - (np.max(z_range_below_limit) -
                                   np.min(z_range_below_limit)) / z_span

            if ratio_below == 0.0:
                ratio = ratio_above
            elif ratio_above == 1.0:
                ratio = ratio_below
            else:
                ratio = np.min([ratio_below, ratio_above])
            ratio_lst.append(ratio)
        else:
            # Expected structure is not dominant: nothing to plot, ratio 0.
            z_range = gaussian_funct = z_range_above_limit = ratio = None
            ratio_lst.append(0.0)

        if debug_plot:
            # Scatter of all atoms in the z-x plane, marked by CNA class.
            plt.title('strain: ' + str(strain))
            plt.xlabel('position z')
            plt.ylabel('position x')
            plt.plot(struct.positions[:, 2],
                     struct.positions[:, 0],
                     'o',
                     label='all')
            for label in ('BCC', 'FCC', 'HCP'):
                selected = cna == label
                plt.plot(struct.positions[:, 2][selected],
                         struct.positions[:, 0][selected],
                         'x',
                         label=label)
            cna_str_lst = struct.positions[:, 2][cna == cna_str]
            if len(cna_str_lst) != 0:
                plt.axvline(cna_str_lst.max(), color='red')
                plt.axvline(cna_str_lst.min(), color='red')
            plt.legend()
            plt.show()

            # Normalised density along z with the 0.1 threshold.
            plt.xlabel('Position in z')
            plt.ylabel('kernel density score')
            plt.title('strain: ' + str(strain))
            if z_range is not None:
                plt.plot(z_range, gaussian_funct, label=cna_str)
                plt.axvline(np.min(z_range_above_limit),
                            color='black',
                            linestyle='--',
                            label='ratio: ' + str(ratio))
                plt.axvline(np.max(z_range_above_limit),
                            color='black',
                            linestyle='--')
                plt.axhline(0.1, color='red')
                plt.legend()
            plt.show()
    return ratio_lst