def get_kd(model, X_train, Y_train, X_test, X_test_noisy, X_test_adv):
    """
    Get kernel density scores
    :param model:
    :param X_train:
    :param Y_train:
    :param X_test:
    :param X_test_noisy:
    :param X_test_adv:
    :return: artifacts: positive and negative examples with kd values,
             labels: adversarial (label: 1) and normal/noisy (label: 0) examples
    """
    # Get deep feature representations
    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model, X_train,
                                                batch_size=args.batch_size)
    X_test_normal_features = get_deep_representations(model, X_test,
                                                      batch_size=args.batch_size)
    X_test_noisy_features = get_deep_representations(model, X_test_noisy,
                                                     batch_size=args.batch_size)
    X_test_adv_features = get_deep_representations(model, X_test_adv,
                                                   batch_size=args.batch_size)

    # Train one KDE per class
    print('Training KDEs...')
    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]
    kdes = {}
    warnings.warn("Using pre-set kernel bandwidths that were determined "
                  "optimal for the specific CNN models of the paper. If you've "
                  "changed your model, you'll need to re-optimize the "
                  "bandwidth.")
    print('bandwidth %.4f for %s' % (BANDWIDTHS[args.dataset], args.dataset))
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian',
                                bandwidth=BANDWIDTHS[args.dataset]) \
            .fit(X_train_features[class_inds[i]])

    # Get model predictions
    print('Computing model predictions...')
    preds_test_normal = model.predict_classes(X_test, verbose=0,
                                              batch_size=args.batch_size)
    preds_test_noisy = model.predict_classes(X_test_noisy, verbose=0,
                                             batch_size=args.batch_size)
    preds_test_adv = model.predict_classes(X_test_adv, verbose=0,
                                           batch_size=args.batch_size)

    # Get density estimates
    print('Computing densities...')
    densities_normal = score_samples(kdes, X_test_normal_features,
                                     preds_test_normal)
    densities_noisy = score_samples(kdes, X_test_noisy_features,
                                    preds_test_noisy)
    densities_adv = score_samples(kdes, X_test_adv_features, preds_test_adv)

    print("densities_normal:", densities_normal.shape)
    print("densities_adv:", densities_adv.shape)
    print("densities_noisy:", densities_noisy.shape)

    # Skip the normalization here; you may want to try different
    # normalizations later, so at this step just save the raw values.
    # densities_normal_z, densities_adv_z, densities_noisy_z = normalize(
    #     densities_normal,
    #     densities_adv,
    #     densities_noisy
    # )

    densities_pos = densities_adv
    densities_neg = np.concatenate((densities_normal, densities_noisy))
    artifacts, labels = merge_and_generate_labels(densities_pos, densities_neg)

    return artifacts, labels
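# A minimal sketch of the score_samples() helper assumed above (a hypothetical
# reconstruction, not necessarily the original implementation): each sample is
# scored under the KDE that was fit on the class the model predicted for it.
def score_samples(kdes, features, preds):
    # kdes: dict mapping class index -> fitted KernelDensity
    # features: (n_samples, n_features) deep representations
    # preds: (n_samples,) predicted class indices
    return np.asarray([kdes[pred].score_samples(feat.reshape(1, -1))[0]
                       for feat, pred in zip(features, preds)])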
dist = np.sqrt(
    np.sum(np.square(y_reconstructed - test_latents).reshape(
        len(test_latents), -1), axis=1))
sns.distplot(dist)
pred_save(dist, PRED_FOLDER + 'prediction_unet_vae_pca_reconstruced.csv')

# %%
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

print('TSNE fitting...')
tsne = TSNE(n_components=2, random_state=SEED, verbose=True)
y_TSNE = tsne.fit_transform(test_latents)
plt.scatter(y_TSNE[:, 0], y_TSNE[:, 1], s=1)
rmse_tsne_test = np.sqrt(
    np.square(y_TSNE[:, 0] - np.mean(y_TSNE[:, 0])) +
    np.square(y_TSNE[:, 1] - np.mean(y_TSNE[:, 1])))
sns.distplot(rmse_tsne_test)
pred_save(rmse_tsne_test, PRED_FOLDER + 'prediction_unet_vae_tsne_rmse.csv')

# %%
from sklearn.neighbors import KernelDensity

kd = KernelDensity()
kd.fit(test_latents)
score = np.array([kd.score(i.reshape(1, -1)) for i in test_latents])
score = score - np.min(score)
sns.distplot(score)
pred_save(score, PRED_FOLDER + 'prediction_unet_vae_latentkd.csv')

# %%
def one_cut(self, x):
    #~ x = x[x>(thresh-threshold_margin)]
    #~ kde = scipy.stats.gaussian_kde(x, bw_method=kde_bandwith)
    #~ d = kde(bins)
    #~ d /= np.sum(d)

    kde = KernelDensity(kernel='gaussian', bandwidth=self.kde_bandwith)
    d = kde.fit(x[:, np.newaxis]).score_samples(self.bins[:, np.newaxis])
    d = np.exp(d)

    # local maxima and minima
    d0, d1, d2 = d[:-2], d[1:-1], d[2:]
    #~ ind_max, = np.nonzero((d0<d1) & (d2<d1))
    ind_max, = np.nonzero((d0 < d1) & (d2 <= d1))
    ind_max += 1
    #~ ind_min, = np.nonzero((d0>d1) & (d2>d1))
    ind_min, = np.nonzero((d0 > d1) & (d2 >= d1))
    ind_min += 1

    #~ print('ind_max', ind_max)
    #~ print('ind_min', ind_min)
    #~ fig, ax = plt.subplots()
    #~ ax.plot(d)
    #~ ax.plot(ind_max, d[ind_max], ls='None', marker='o', color='r')
    #~ ax.plot(ind_min, d[ind_min], ls='None', marker='o', color='g')
    #~ plt.show()

    if ind_max.size > 0:
        if ind_min.size == 0:
            assert ind_max.size == 1, 'Very fishy: no minimum but several maxima'
            ind_min = np.array([0, self.bins.size - 1], dtype='int64')
        else:
            ind_min = ind_min.tolist()
            if ind_max[0] < ind_min[0]:
                ind_min = [0] + ind_min
            if ind_max[-1] > ind_min[-1]:
                ind_min = ind_min + [self.bins.size - 1]
            ind_min = np.array(ind_min, dtype='int64')

    # Loop 1: reject small rebound minima/maxima
    #~ print('loop1')
    ind_max_cleaned = ind_max.tolist()
    ind_min_cleaned = ind_min.tolist()
    while True:
        rejected_minima = None
        rejected_maxima = None
        #~ print('ind_min_cleaned', ind_min_cleaned, self.bins[ind_min_cleaned])
        #~ print('ind_max_cleaned', ind_max_cleaned, self.bins[ind_max_cleaned])
        for i, ind in enumerate(ind_min_cleaned[1:-1]):
            prev_max = ind_max_cleaned[i]
            next_max = ind_max_cleaned[i + 1]
            delta_density_prev = d[prev_max] - d[ind]
            delta_density_next = d[next_max] - d[ind]
            if min(delta_density_prev, delta_density_next) < d[ind] * self.minima_rejection_factor:
                rejected_minima = ind
                if delta_density_prev < delta_density_next:
                    rejected_maxima = prev_max
                else:
                    rejected_maxima = next_max
                break
        if rejected_minima is None:
            break
        ind_max_cleaned.remove(rejected_maxima)
        ind_min_cleaned.remove(rejected_minima)

    #~ print('loop2')
    # Loop 2: reject modes containing too few spikes
    while True:
        rejected_minima = None
        rejected_maxima = None
        #~ print('ind_min_cleaned', ind_min_cleaned, self.bins[ind_min_cleaned])
        #~ print('ind_max_cleaned', ind_max_cleaned, self.bins[ind_max_cleaned])
        for i, ind in enumerate(ind_min_cleaned[:-1]):
            next_min = ind_min_cleaned[i + 1]
            n = np.sum(d[ind:next_min] * self.binsize) * x.size
            #~ print('n', n, self.bins[ind], self.bins[next_min], np.sum(d))
            if n < self.nb_min:
                rejected_maxima = ind_max_cleaned[i]
                if d[ind] < d[next_min]:
                    rejected_minima = next_min
                else:
                    rejected_minima = ind
                break
        if rejected_minima is None:
            break
        ind_max_cleaned.remove(rejected_maxima)
        ind_min_cleaned.remove(rejected_minima)

    #~ print('loop3')
    # TODO: eliminate the first mode with the same criterion as loop 1
    if len(ind_min_cleaned) >= 2:
        den_min0 = d[ind_min_cleaned[0]]
        den_max0 = d[ind_max_cleaned[0]]
        if (den_max0 - den_min0) < den_min0 * self.minima_rejection_factor:
            ind_min_cleaned = ind_min_cleaned[1:]
            ind_max_cleaned = ind_max_cleaned[1:]

    #~ print('loop4')
    if len(ind_min_cleaned) >= 2:
        if self.bins[ind_max_cleaned[0]] < self.threshold + self.margin_first_max:
            ind_min_cleaned = ind_min_cleaned[1:]
            ind_max_cleaned = ind_max_cleaned[1:]

    if len(ind_min_cleaned) >= 2:
        # TODO: criterion for choosing the best cut
        return self.bins[ind_min_cleaned[-2]], self.bins[ind_min_cleaned[-1]], d
    else:
        return None, None, d
        r11_high = np.max(line_X11[lindx])
    else:
        r11_low = -1.0
        r11_high = -1.0
    if i == 0:
        r11_low0 = r11_low
        r11_high0 = r11_high
    print(' lz region for 1:1 =', r11_low, r11_high)
    ax[i].add_patch(
        patches.Rectangle((r11_low, ymin_hist), r11_high - r11_low,
                          ymax_hist - ymin_hist, facecolor='grey',
                          fill=True, alpha=0.5))
    # histogram
    # ax[i].hist(lzs, bins=lz_bins, fc='#AAAAFF', density=True)
    # KDE
    kde = KernelDensity(kernel='epanechnikov',
                        bandwidth=hlz).fit(lzs.reshape(-1, 1),
                                           sample_weight=probs)
    log_dens = kde.score_samples(lz_bins.reshape(-1, 1))
    ax[i].plot(lz_bins, np.exp(log_dens), color='black')
    # set tick params
    ax[i].tick_params(labelsize=16, color='k', direction="in")
    ax[i].set_xlim(lzmin_hist, lzmax_hist)
    ax[i].set_ylim(ymin_hist, ymax_hist)
    if i == njrsamp - 1:
        ax[i].set_xticks([0.5, 1.0, 1.5])
    if i == 0:
        ax[i].set_ylabel(r"dN($0.03<{\rm J}_{\rm R}<0.1$)", fontsize=14)
    if i == 1:
        ax[i].set_ylabel(r"dN($0.01<{\rm J}_{\rm R}<0.02$)", fontsize=14)
        ax[i].set_xticks([0.5, 1.0, 1.5])
n_features = [15, 25, 30]
print('===============================================================================================')
for i in range(len(n_features)):
    # ----------------- Reducing the number of dimensions using PCA -----------------
    pca = PCA(n_components=n_features[i], whiten=False)
    data = pca.fit_transform(digits.data)

    # ------- Performing Grid Search Cross-Validation to optimize the bandwidth ------
    print('Performing Grid Search Cross-Validation to optimize the bandwidth')
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params, cv=5)
    grid.fit(data)
    print('Best Bandwidth using {} features: {}'.format(
        n_features[i], grid.best_estimator_.bandwidth))

    # -------------------- Perform KDE using this best bandwidth ---------------------
    kde = grid.best_estimator_

    # -------------- Sample 48 new data points from the estimated density ------------
    new_data = kde.sample(48, random_state=0)
    new_data = pca.inverse_transform(new_data)

    # turn the data into a 6x8 grid
    new_data = new_data.reshape((6, 8, -1))
    real_data = digits.data[:48].reshape((6, 8, -1))
#     np.place(vec, abs(vec) < 1, 1)
#     np.place(vec, abs(vec) > 1, 0)
#     return vec
#
# def densite_estime(X, data, h):
#     N = data.shape[0]
#     return (1 / (2 * N * h)) * sum(is_within_the_hypercube((X - data) / h))
#
# def hist_noyaux_boxcar(vec_X, data, bandwidth):
#     Y_densite = []
#     for i in range(vec_X.shape[0]):
#         Y_densite.append(densite_estime(vec_X[i, :], data, bandwidth))
#     return Y_densite / sum(Y_densite)

X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

kde_1 = KernelDensity(kernel='tophat', bandwidth=0.3).fit(X1)
kde_2 = KernelDensity(kernel='tophat', bandwidth=1).fit(X1)
kde_3 = KernelDensity(kernel='tophat', bandwidth=2).fit(X1)
kde_4 = KernelDensity(kernel='tophat', bandwidth=5).fit(X1)

kde_1_2 = KernelDensity(kernel='tophat', bandwidth=0.3).fit(X2)
kde_2_2 = KernelDensity(kernel='tophat', bandwidth=1).fit(X2)
kde_3_2 = KernelDensity(kernel='tophat', bandwidth=2).fit(X2)
kde_4_2 = KernelDensity(kernel='tophat', bandwidth=5).fit(X2)

fig, ax = pyplot.subplots(2, 1, sharex=True, sharey=True)
fig.subplots_adjust(hspace=0.4, wspace=0.05)
ax[0].plot(X_plot, np.exp(kde_1.score_samples(X_plot)), label="bandwidth=0.3")
ax[0].plot(X_plot, np.exp(kde_2.score_samples(X_plot)), label="bandwidth=1")
ax[0].plot(X_plot, np.exp(kde_3.score_samples(X_plot)), label="bandwidth=2")
                             weights=model.layers[4].get_weights()))
encoder_replica.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
encoder_replica.summary()

# scikit-learn's KernelDensity expects each sample to be a flat feature
# vector, so we flatten the tensors created by the encoder
encoded_images = encoder_replica.predict_generator(train_generator)
encoded_images_flat = [np.reshape(img, (27,)) for img in encoded_images]

validation_encoded = encoder_replica.predict_generator(validation_generator)
val_enc_flat = [np.reshape(img, (27,)) for img in validation_encoded]

anom_encoded = encoder_replica.predict_generator(anomaly_generator)
anom_enc_flat = [np.reshape(img, (27,)) for img in anom_encoded]

# Kernel density estimation of the encoded vectors
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(encoded_images_flat)
training_density_scores = kde.score_samples(encoded_images_flat)
validation_density_scores = kde.score_samples(val_enc_flat)
anomaly_density_scores = kde.score_samples(anom_enc_flat)

# Plot the density distributions of the training (normal), validation (normal)
# and anomalous images. Ideally we want to see a clear separation between the
# normal and anomalous classes.
plt.figure(figsize=(10, 7))
plt.title('Distribution of Density Scores')
plt.hist(training_density_scores, 12, alpha=0.5, label='Training Normal')
plt.hist(validation_density_scores, 12, alpha=0.5, label='Validation Normal')
plt.hist(anomaly_density_scores, 12, alpha=0.5, label='Anomalies')
plt.legend(loc='upper right')
plt.xlabel('Density Score')
plt.show()
# Data selection
no_transaction = X[:, 1]  # Frequency
sum_amounts = X[:, 2]     # Money

# Plot a 1D density example
np.random.seed(1)
N = no_transaction.shape[0]
X = no_transaction[:, np.newaxis]  # np.random.normal(0, 1, 0.3 * N)[:, np.newaxis]
X_plot = np.linspace(np.min(X), np.max(X), 1000)[:, np.newaxis]

fig, ax = plt.subplots()

for kernel in ['gaussian', 'tophat', 'epanechnikov']:
    kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
            label="kernel = '{0}'".format(kernel))

ax.text(6, 0.38, "N={0} points".format(N))
ax.legend(loc='upper left')
# ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
# ax.set_xlim(-4, 9)
# ax.set_ylim(-0.02, 0.4)
plt.show()
# load decoder
decoder_name = encoder_name.replace('encoder', 'decoder')
with open(decoder_name) as fl:
    decoder = model_from_yaml(fl)
decoder.load_weights(decoder_name[:-4] + 'h5')

target_seqs = decoder.predict(target_latents, batch_size=1000)
generated_seqs = target_seqs[:, ::10, :]

X_test = X_test.reshape(X_test.shape[0], X_test.shape[1] * X_test.shape[2])
generated_seqs = generated_seqs.reshape(
    generated_seqs.shape[0], generated_seqs.shape[1] * generated_seqs.shape[2])

if args.bandwidth is None:
    # grid search
    params = {'bandwidth': np.logspace(-1, 0., 10)}
    grid = GridSearchCV(KernelDensity(), params, cv=3, verbose=1)
    X_search = np.random.permutation(X)[:10000, ::10, :]
    X_search = X_search.reshape(X_search.shape[0],
                                X_search.shape[1] * X_search.shape[2])
    grid_result = grid.fit(X_search)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    for mean_score, std_score, params in zip(
            grid_result.cv_results_['mean_test_score'],
            grid_result.cv_results_['std_test_score'],
            grid_result.cv_results_['params']):
        print("score mean: %f (score std: %f) with: %r"
              % (mean_score, std_score, params))
    # bandwidth = 0.25
    bandwidth = grid_result.best_params_['bandwidth']
else:
    bandwidth = args.bandwidth

ParzenWindow = KernelDensity(bandwidth=bandwidth, algorithm='auto',
                             kernel='gaussian', metric='euclidean')
print("shape of generated_seqs is {}".format(generated_seqs.shape))
def doublet_finder(ds: loompy.LoomConnection, use_pca: bool = False,
                   proportion_artificial: float = 0.20, fixed_th: float = None,
                   k: int = None, name: object = "tmp", qc_dir: object = ".",
                   graphs: bool = True, max_th: float = 1) -> np.ndarray:
    # Step 1: Generate artificial doublets from the input
    logging.debug("Creating artificial doublets")
    n_real_cells = ds.shape[1]
    n_doublets = int(n_real_cells / (1 - proportion_artificial) - n_real_cells)
    doublets = np.zeros((ds.shape[0], n_doublets))
    for i in range(n_doublets):
        a = np.random.choice(ds.shape[1])
        b = np.random.choice(ds.shape[1])
        doublets[:, i] = ds[:, a] + ds[:, b]

    data_wdoublets = np.concatenate((ds[:, :], doublets), axis=1)

    logging.debug("Feature selection and dimensionality reduction")
    genes = FeatureSelectionByVariance(2000).fit(ds)
    if use_pca:
        # the R function uses log2 counts/million
        f = np.divide(data_wdoublets.sum(axis=0), 10e6)
        norm_data = np.divide(data_wdoublets, f)
        norm_data = np.log(norm_data + 1)
        pca = PCA(n_components=50).fit_transform(norm_data[genes, :].T)
    else:
        data = sparse.coo_matrix(data_wdoublets[genes, :]).T
        hpf = HPF(k=64, validation_fraction=0.05, min_iter=10, max_iter=200,
                  compute_X_ppv=False)
        hpf.fit(data)
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    if k is None:
        k = int(np.min([100, ds.shape[1] * 0.01]))
    logging.info(f"Initialize NN structure with k = {k}")

    if use_pca:
        knn_result = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
        knn_result.fit(pca)
        knn_dist, knn_idx = knn_result.kneighbors(X=pca, return_distance=True)

        num = ds.shape[1]
        knn_result1 = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
        knn_result1.fit(pca[0:num, :])
        knn_dist1, knn_idx1 = knn_result1.kneighbors(X=pca[num + 1:, :],
                                                     n_neighbors=10)
        knn_dist_rc, knn_idx_rc = knn_result1.kneighbors(X=pca[0:num, :],
                                                         return_distance=True)
    else:
        knn_result = NNDescent(data=theta, metric=jensen_shannon_distance)
        knn_idx, knn_dist = knn_result.query(theta, k=k)

        num = ds.shape[1]
        knn_result1 = NNDescent(data=theta[0:num, :], metric=jensen_shannon_distance)
        knn_idx1, knn_dist1 = knn_result1.query(theta[num + 1:, :], k=10)
        knn_idx_rc, knn_dist_rc = knn_result1.query(theta[0:num, :], k=k)

    dist_th = np.mean(knn_dist1.flatten()) + 1.64 * np.std(knn_dist1.flatten())

    doublet_freq = np.logical_and(knn_idx > ds.shape[1], knn_dist < dist_th)
    doublet_freq_A = doublet_freq[ds.shape[1]:ds.shape[1] + n_doublets, :]
    mean1 = doublet_freq_A.mean(axis=1)
    mean2 = doublet_freq_A[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score_A = np.maximum(mean1, mean2)

    doublet_freq = doublet_freq[0:ds.shape[1], :]
    mean1 = doublet_freq.mean(axis=1)
    mean2 = doublet_freq[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score = np.maximum(mean1, mean2)
    doublet_flag = np.zeros(ds.shape[1], int)

    doublet_th1 = 1
    doublet_th2 = 1
    doublet_th = 1

    # Infer the threshold from the data, or use a fixed threshold
    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(doublet_score_A[:, None])

    # score_samples returns the log of the probability density
    xx = np.linspace(doublet_score_A.min(), doublet_score_A.max(),
                     len(doublet_score_A)).reshape(-1, 1)
    logprob = kde.score_samples(xx)

    if fixed_th is not None:
        doublet_th = float(fixed_th)
    else:
        # Check whether the distribution is bimodal
        intervals = UniDip(np.exp(logprob)).run()
        if (len(intervals) > 1):
            kmeans = KMeans(n_clusters=2).fit(
                doublet_score_A.reshape(len(doublet_score_A), 1))
            high_cluster = np.where(
                kmeans.cluster_centers_ == max(kmeans.cluster_centers_))[0][0]
            doublet_th1 = np.around(
                np.min(doublet_score_A[kmeans.labels_ == high_cluster]),
                decimals=3)

        # 0.5% for every 1000 cells - the rate of detectable doublets by 10X
        doublet_th2 = np.percentile(doublet_score, 100 - (5e-4 * ds.shape[1]))
        doublet_th2 = np.around(doublet_th2, decimals=3)

        # the threshold shouldn't be higher than indicated
        if doublet_th2 > max_th:
            doublet_th2 = max_th
        if doublet_th1 > max_th:
            doublet_th1 = max_th

        if (len(np.where(doublet_score >= doublet_th1)[0]) >
                (len(np.where(doublet_score >= doublet_th2)[0]))):
            doublet_th = doublet_th2
        else:
            doublet_th = doublet_th1

    doublet_flag[doublet_score >= doublet_th] = 1

    # Calculate the score for the cells that are nearest neighbors of the
    # marked doublets
    if use_pca:
        pca_rc = pca[0:num, :]
        knn_dist1_rc, knn_idx1_rc = knn_result1.kneighbors(
            X=pca_rc[doublet_flag == 1, :], n_neighbors=10,
            return_distance=True)
    else:
        theta_rc = theta[0:num, :]
        knn_idx1_rc, knn_dist1_rc = knn_result1.query(
            theta_rc[doublet_flag == 1, :], k=10)

    dist_th = np.mean(knn_dist1_rc.flatten()) + 1.64 * np.std(knn_dist1_rc.flatten())

    doublet2_freq = np.logical_and(doublet_flag[knn_idx_rc] == 1,
                                   knn_dist_rc < dist_th)
    doublet2_nn = knn_dist_rc < dist_th
    doublet2_score = doublet2_freq.sum(axis=1) / doublet2_nn.sum(axis=1)

    doublet_flag[np.logical_and(doublet_flag == 0,
                                doublet2_score >= doublet_th / 2)] = 2

    if graphs:
        if (use_pca):
            ds.ca.PCA = pca[0:ds.shape[1], :]
        else:
            ds.ca.HPF = theta[0:ds.shape[1], :]
        doublets_plots.plot_all(ds,
                                out_file=os.path.join(qc_dir, name + "_doublets.png"),
                                labels=doublet_flag,
                                doublet_score_A=doublet_score_A,
                                logprob=logprob, xx=xx,
                                score1=doublet_th1, score2=doublet_th2,
                                score=doublet_th)

    logging.info(
        f"Doublet fraction: {100 * len(np.where(doublet_flag > 0)[0]) / ds.shape[1]:.2f}%, "
        f"{len(np.where(doublet_flag > 0)[0])} cells. \n\t\t\t"
        f"(Expected detectable doublet fraction: {(5e-4 * ds.shape[1]):.2f}%)")

    return doublet_score, doublet_flag
def plot_pfam_familysizes(pfam_df, plot_dir):
    # counts for Pfam families with and without annotated PDB structures
    struct = np.log(pfam_df.query('nr_structures > 0')['nr_sequences'].values)
    no_struct = np.log(pfam_df.query('nr_structures == 0')['nr_sequences'].values)

    # define grid for kernel density estimation
    x_grid = np.linspace(np.min(struct.tolist() + no_struct.tolist()),
                         np.max(struct.tolist() + no_struct.tolist()), 500)
    bandwidth = 0.3

    # define colors for struct and no_struct (earlier palettes kept for reference)
    colors = ['rgb(22, 96, 167)', 'rgb(205, 12, 24)']
    colors = ['rgb(170, 221, 172)', 'rgb(3, 177, 74)']
    colors = ['rgb(170, 170, 170)', 'rgb(0,0,0)']

    # kernel density estimate for Pfam families with annotated structure
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bandwidth).fit(struct.reshape(-1, 1))
    struct_density = np.exp(kde.score_samples(x_grid.reshape(-1, 1)))
    struct_density_normalized_counts = (len(struct) / np.sum(struct_density)
                                        * struct_density)

    # kernel density estimate for Pfam families without annotated structure
    kde = KernelDensity(kernel='gaussian',
                        bandwidth=bandwidth).fit(no_struct.reshape(-1, 1))
    nostruct_density = np.exp(kde.score_samples(x_grid.reshape(-1, 1)))
    nostruct_density_normalized_counts = (len(no_struct) / np.sum(nostruct_density)
                                          * nostruct_density)

    # add plot trace for struct
    trace_kde_struct = go.Scatter(
        x=x_grid,
        y=struct_density_normalized_counts,
        mode='lines',
        line=dict(color=colors[0], width=4),
        name="<b>with</b> structural <br>annotation (" + str(len(struct)) + ")")

    # add plot trace for no_struct
    trace_kde_nostruct = go.Scatter(
        x=x_grid,
        y=nostruct_density_normalized_counts,
        mode='lines',
        line=dict(color=colors[1], width=4),
        name="<b>lacking</b> structural <br>annotation (" + str(len(no_struct)) + ")")

    # add vertical line at the median family size of families with structures
    median_struct = np.median(struct)
    trace_median_struct = go.Scatter(
        x=[median_struct, median_struct],
        y=[0, np.max([np.max(struct_density_normalized_counts),
                      np.max(nostruct_density_normalized_counts)])],
        mode='lines+text',
        name="median family size",
        textfont=dict(family='sans serif', size=18, color=colors[0]),
        text=["", " median: " + str(np.round(np.exp(median_struct), decimals=3))],
        textposition='right',
        line=dict(color=colors[0], width=4, dash='dash'),
        showlegend=False)

    # add vertical line at the median family size of families with NO structures
    median_nostruct = np.median(no_struct)
    trace_median_nostruct = go.Scatter(
        x=[median_nostruct, median_nostruct],
        y=[0, np.max([np.max(struct_density_normalized_counts),
                      np.max(nostruct_density_normalized_counts)])],
        mode='lines+text',
        name="median family size",
        line=dict(color=colors[1], width=4, dash='dash'),
        textfont=dict(family='sans serif', size=18, color=colors[1]),
        text=["", "median: " + str(np.round(np.exp(median_nostruct), decimals=3)) + " "],
        textposition="left",
        showlegend=False)

    data = [trace_kde_nostruct, trace_median_nostruct,
            trace_kde_struct, trace_median_struct]

    layout = go.Layout(
        xaxis=dict(title='number of sequences per family',
                   tickvals=np.log([10, 100, 1000, 10000, 100000]),
                   ticktext=["$10^1$", "$10^2$", "$10^3$", "$10^4$", "$10^5$"],
                   exponentformat="e",
                   showexponent='All',
                   zeroline=False),
        yaxis=dict(title='number of protein families',
                   exponentformat="e",
                   showexponent='All',
                   zeroline=False),
        font=dict(size=18),
        legend=dict(x=0.75, y=0.88, orientation="v"),  # vertical legend inside the plot
        title="PFAM family sizes <br> Pfam 31.0 (March 2017, 16712 entries)")

    # define plot figure
    fig = go.Figure(data=data, layout=layout)

    # plot with title
    plot_out = plot_dir + "/pfam_pdb.html"
    # plotly_plot(fig, filename=plot_out, auto_open=False)
    with_jax(fig, filename=plot_out)

    # plot without title
    fig['layout']['title'] = ""
    fig['layout']['margin']['t'] = 10
    fig['layout']['margin']['b'] = 150
    plot_out = plot_dir + "/pfam_pdb_notitle.html"
    # plotly_plot(fig, filename=plot_out, auto_open=False)
    with_jax(fig, filename=plot_out)
def best_split(data, I=(-np.inf, np.inf)):
    '''With bimodal data, find the split at the point of lowest density.'''
    h_crit = critical_bandwidth_m_modes(data, 2, I)
    kde = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(data.reshape(-1, 1))
    x = np.linspace(max(np.min(data), I[0]), min(np.max(data), I[1]), 200)
    y = np.exp(kde.score_samples(x.reshape(-1, 1)))
    modes = argrelextrema(np.hstack([[0], y, [0]]), np.greater)[0]
    if len(modes) != 2:
        raise ValueError("{} modes at: {}".format(len(modes), x[modes - 1]))
    ind_min = modes[0] - 1 + argrelextrema(y[(modes[0] - 1):(modes[1] - 1)], np.less)[0]
    return x[ind_min]


if __name__ == '__main__':
    import matplotlib.pyplot as plt

    if 1:
        N = 1000
        data = np.hstack([np.random.randn(N // 2), np.random.randn(N // 4) + 4])
        h_crit = critical_bandwidth_m_modes(data, 2)
        x = np.linspace(-3, 8)
        y = KernelDensity(kernel='gaussian', bandwidth=h_crit).fit(
            data.reshape(-1, 1)).score_samples(x.reshape(-1, 1))
        fig, ax = plt.subplots()
        ax.plot(x, np.exp(y))
        ax.axvline(best_split(data, (1, 4)), color='red')
        plt.show()
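# A minimal sketch of the critical_bandwidth_m_modes() helper assumed above
# (a hypothetical reconstruction, not the original implementation): bisect for
# the smallest Gaussian-KDE bandwidth at which the density restricted to the
# interval I has at most m modes (Silverman's critical bandwidth). The search
# range and tolerance are illustrative assumptions.
from scipy.signal import argrelextrema

def critical_bandwidth_m_modes(data, m, I=(-np.inf, np.inf), tol=1e-3):
    def n_modes(h):
        kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(data.reshape(-1, 1))
        x = np.linspace(max(np.min(data), I[0]), min(np.max(data), I[1]), 200)
        y = np.exp(kde.score_samples(x.reshape(-1, 1)))
        return len(argrelextrema(np.hstack([[0], y, [0]]), np.greater)[0])

    lo, hi = tol, 2 * np.std(data)  # assumes the upper end is unimodal enough
    while hi - lo > tol:
        mid = (lo + hi) / 2
        if n_modes(mid) <= m:
            hi = mid
        else:
            lo = mid
    return hi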
    rho[i, 0] = round(rho[i, 0], 3)

df_confidence = pd.DataFrame(Index, columns=['Index'])
df_confidence['Ident'] = Ident
df_confidence['Ypred'] = Ypred
df_confidence['rho'] = rho
df_confidence['Yoriginal'] = Yoriginal

# plot the histogram
ff.rhoHist(rho, n_equal_bins=100)

# X_plot = np.linspace(-1, 1, 100)[:, np.newaxis]
X_plot = np.linspace(-1, 1, 100)
X_plot = X_plot.reshape((-1, 1))
# bins = np.linspace(-1, 1, 50)
fig, ax = plt.subplots(figsize=(8, 4))

# Gaussian KDE
# kde = KernelDensity(kernel='tophat', bandwidth=0.1).fit(rho)
kde = KernelDensity(kernel='gaussian', bandwidth=0.04).fit(rho)
log_dens = kde.score_samples(X_plot)
ax.fill(X_plot, np.exp(log_dens), fc='#AAAAFF')
# ax.fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
# ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")

# calculate the false rate for different values of epsilon
# eps ==> maximum value of epsilon
# num_eps ==> the number of epsilon values
falseRate = ff.Predrejection(df_confidence, eps=0.8, num_eps=100)
#### SKLEARN KDE ##################################
from sklearn.neighbors import KernelDensity

#DELETE xyz = np.vstack([xi, yi, zi])  # original
d = values2.shape[0]  # num dimensions (should be 3 here)
n = values2.shape[1]  # num samples
bwsklearn = (n * (d + 2) / 4.)**(-1. / (d + 4))  # Silverman's rule
# bw = n**(-1. / (d + 4))  # Scott's rule
print('SKLEARN bw (silverman): {}'.format(bwsklearn))

kde2 = KernelDensity(
    bandwidth=bwsklearn,
    metric='minkowski',  # 'euclidean'
    kernel='gaussian',
    algorithm='ball_tree').fit(values2.T, y=None, sample_weight=None)
# fit() expects input of shape (n_samples, n_features)
# out42 = kde2.fit(values2.T, y=None, sample_weight=None)

# xmin = np.min(xi)
# xmax = np.max(xi)
# ymin = np.min(yi)
# ymax = np.max(yi)
# zmin = np.min(zi)
# zmax = np.max(zi)
#DELETE positions = np.vstack([xi.ravel(), yi.ravel(), zi.ravel()])

X, Y, Z = np.mgrid[xmin:xmax:50j, ymin:ymax:50j, zmin:zmax:50j]  #DELETE
positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()])
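# A hedged continuation sketch (an assumption about the next step, not part of
# the original snippet): evaluate the fitted KDE on the mgrid positions.
# score_samples() expects shape (n_samples, n_features), hence the transpose.
density = np.exp(kde2.score_samples(positions.T)).reshape(X.shape)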
import numpy as np
from sklearn.datasets import load_digits
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# load the data
digits = load_digits()

# project the 64-dimensional data to a lower dimension
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)

# use grid search cross-validation to optimize the bandwidth
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)
grid.fit(data)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))
def _bivariate_kdeplot(x, y, xscale=None, yscale=None, shade=False, bw="scott",
                       gridsize=50, cut=3, clip=None, legend=True,
                       legend_data=None, **kwargs):
    ax = plt.gca()

    label = kwargs.pop('label', None)

    # Determine the clipping
    clip = [(-np.inf, np.inf), (-np.inf, np.inf)]

    x = xscale(x)
    y = yscale(y)

    x_nan = np.isnan(x)
    y_nan = np.isnan(y)

    x = x[~(x_nan | y_nan)]
    y = y[~(x_nan | y_nan)]

    if bw == 'scott':
        bw_x = bw_scott(x)
        bw_y = bw_scott(y)
        bw = (bw_x + bw_y) / 2
    elif bw == 'silverman':
        bw_x = bw_silverman(x)
        bw_y = bw_silverman(y)
        bw = (bw_x + bw_y) / 2
    elif isinstance(bw, float):
        bw_x = bw_y = bw
    else:
        raise util.CytoflowViewError(
            None, "Bandwidth must be 'scott', 'silverman' or a float")

    kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(np.column_stack((x, y)))

    x_support = _kde_support(x, bw_x, gridsize, cut, clip[0])
    y_support = _kde_support(y, bw_y, gridsize, cut, clip[1])

    xx, yy = np.meshgrid(x_support, y_support)
    z = kde.score_samples(np.column_stack((xx.ravel(), yy.ravel())))
    z = z.reshape(xx.shape)
    z = np.exp(z)

    n_levels = kwargs.pop("n_levels", 10)
    color = kwargs.pop("color")
    kwargs['colors'] = (color,)

    min_alpha = kwargs.pop("min_alpha", 0.2)
    if shade:
        min_alpha = 0
    max_alpha = kwargs.pop("max_alpha", 0.9)

    x_support = xscale.inverse(x_support)
    y_support = yscale.inverse(y_support)
    xx, yy = np.meshgrid(x_support, y_support)

    contour_func = ax.contourf if shade else ax.contour
    try:
        cset = contour_func(xx, yy, z, n_levels, **kwargs)
    except ValueError as e:
        raise util.CytoflowViewError(
            None, "Something went wrong in {}, bandwidth = {}. ".format(
                contour_func.__name__, bw)) from e

    num_collections = len(cset.collections)
    alpha = np.linspace(min_alpha, max_alpha, num=num_collections)
    for el in range(num_collections):
        cset.collections[el].set_alpha(alpha[el])

    # Label the axes
    if hasattr(x, "name") and legend:
        ax.set_xlabel(x.name)
    if hasattr(y, "name") and legend:
        ax.set_ylabel(y.name)

    if label is not None:
        ax.set_title(label)
        # Add legend data. Note: 'label' was popped from kwargs above, so we
        # use the saved value instead of checking kwargs again.
        legend_data[label] = plt.Rectangle((0, 0), 1, 1, fc=color)

    return ax
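# A minimal sketch of the _kde_support() helper used above (a hypothetical
# reconstruction following the common seaborn-style convention: extend the
# support `cut` bandwidths beyond the data, clipped to `clip`):
def _kde_support(data, bw, gridsize, cut, clip):
    support_min = max(data.min() - bw * cut, clip[0])
    support_max = min(data.max() + bw * cut, clip[1])
    return np.linspace(support_min, support_max, gridsize)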
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

fig, ax = plt.subplots(1, 1)

lamb = 1.5
t = stats.expon.rvs(size=20, scale=1 / lamb)
c = stats.expon.rvs(size=80, scale=1 / lamb)
j = stats.expon.rvs(size=150, scale=1 / lamb)
print(t)
print(c)
print(j)
print(stats.expon.fit(t))
print(stats.expon.fit(c))
print(stats.expon.fit(j))

x242 = np.linspace(0, 8).reshape(-1, 1)
expo1 = stats.expon.pdf(x242)
ax.plot(x242, expo1, 'r-')

# fit the KDE on the samples (not on the evaluation grid), then
# evaluate it on the grid
kde1 = KernelDensity(kernel='exponential').fit(j.reshape(-1, 1))
norm2412 = np.exp(kde1.score_samples(x242))
ax.plot(x242, norm2412)
ax.plot(x242, stats.expon.cdf(x242), 'g-')
plt.show()
def bw_kde(data=[], start=0.01, end=1.0, cv_size=20):
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(start, end, cv_size)},
                        cv=cv_size)
    grid.fit(data[:, None])
    return grid.best_params_
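# A short usage sketch (assumption: data is a 1-D NumPy array, so data[:, None]
# yields the (n_samples, 1) shape that KernelDensity expects):
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    sample = rng.randn(200)
    print(bw_kde(sample))  # e.g. {'bandwidth': 0.27...}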
    'blizzard': ('2015-01-26 00:00:00', '2015-01-28 00:00:00')
}

taxi['event'] = np.zeros(len(taxi))
for event, duration in events.items():
    start, end = duration
    taxi.loc[start:end, 'event'] = 1

y = y.reshape(y.shape[0], 1)
y = scale(y)

# KDE
kernaldens = KernelDensity(kernel="gaussian", bandwidth=0.75).fit(y)
print(kernaldens)
scores = kernaldens.score_samples(y)
thresh = quantile(scores, .01)
print(thresh)
index = where(scores <= thresh)
values = y[index]

x_ax = range(y.shape[0])
plt.plot(x_ax, y)
plt.scatter(index, values, color='r')
plt.show()

# TOOLTIPS = [
#     ("index", "$index"),
def __init__(self, n_jobs=1, cv=5, bw=np.linspace(0.1, 1.0, 10)):
    # cross-validated bandwidth search (cv folds; 5 by default)
    self.grid = GridSearchCV(KernelDensity(), {'bandwidth': bw},
                             cv=cv, n_jobs=n_jobs)
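# Usage sketch for the wrapper above (the enclosing class name is not shown,
# so `BandwidthSearcher` is a hypothetical stand-in; fit on 1-D samples
# reshaped to the (n_samples, 1) shape KernelDensity expects):
# searcher = BandwidthSearcher(n_jobs=2, cv=5)
# searcher.grid.fit(samples.reshape(-1, 1))
# print(searcher.grid.best_params_['bandwidth'])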
import json

import numpy as np
import matplotlib.pyplot as plt
from distutils.version import LooseVersion
from scipy.stats import norm
from sklearn.neighbors import KernelDensity

with open("repeated/BOHB_config/usage.json", encoding='utf-8') as f:
    data = json.load(f)

_x = []
for params in data:
    channel_2_num = params["config"]["channel_2_num"] + 32
    _x.append(channel_2_num)

X = np.concatenate((_x, []))[:, np.newaxis]
X_plot = np.linspace(30, 70, 1000)[:, np.newaxis]

fig, ax = plt.subplots()
kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
log_dens = kde.score_samples(X_plot)
ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
        label="channel_2_num range = '{0}'".format('[32, 64]'))
ax.legend(loc='upper left')
ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

ax.set_xlim(30, 70)
ax.set_ylim(0, 0.3)
plt.show()
def _sample_posteriors(self, arm):
    kde = KernelDensity()
    kde.fit(pd.DataFrame(
        self.trace[arm]['mu'][-(self.samples_num - self.burn_num):]))
    return float(kde.sample())
n_samples = 1000
linear_gaussian_net = linear_gaussian_generation.generate_sparse_linear_gaussian_system(
    n_vars, max_deg, (0.2, 1), (-1, 1))
X = np.asarray(linear_gaussian_net.get_joint_samples(n_samples)).astype(np.float64)
# print("X: ", X)
'''

X = load_data.load_kde_cleaned_airline_data("Iberia").to_numpy()
# X = X[:1000, :]
X_train = X[:int(0.7 * X.shape[0]), :]
X_test = X[int(0.7 * X.shape[0]):, :]

bandwidth = silverman_scalar_bandwidth(X_train)
print("bandwidth: ", bandwidth)
kde_on_X = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(X_train)
kde_on_X_test_log_likelihood = np.sum(kde_on_X.score_samples(X_test))

normal_dist_on_X = stats.multivariate_normal(mean=np.mean(X_train, axis=0),
                                             cov=np.cov(X_train.T))
normal_dist_on_X_test_log_likelihood = np.sum(normal_dist_on_X.logpdf(X_test))

print("X shape: ", X.shape)

initial_dags = [random_graph.random_dag(X.shape[1], max_deg)
                for i in range(0, 1)]
kernel = 'gaussian'

print("kde_on_X_test_log_likelihood: ", kde_on_X_test_log_likelihood)
print("normal dist test log likelihood: ", normal_dist_on_X_test_log_likelihood)
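# A minimal sketch of the silverman_scalar_bandwidth() helper assumed above
# (a hypothetical reconstruction): Silverman's rule-of-thumb for a single
# scalar bandwidth in d dimensions, scaled by the average per-feature std.
def silverman_scalar_bandwidth(X):
    n, d = X.shape
    sigma = np.mean(np.std(X, axis=0))
    return sigma * (n * (d + 2) / 4.0) ** (-1.0 / (d + 4))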
def optimize_bd(dfGenome, dfPos, dfGene, outpath):
    "Bandwidth optimization by fitting the density to the positive set"
    dfPos['mid'] = ((dfPos['end'] - dfPos['start']) / 2) + dfPos['start']
    chrs = list(dfGenome.chrom.unique())
    bdlist = list(np.linspace(1000, 1000000, 1000))
    sc = np.array([0.0] * (len(bdlist) + 1))
    for chrname in chrs:
        chrlen = int(dfGenome[dfGenome.chrom == chrname].length)
        N = dfPos[dfPos.chrom == chrname].shape[0]
        dfchr = dfGene[dfGene.chrom == chrname]
        dfPosChr = dfPos[dfPos.chrom == chrname]
        Xp = np.array(list(dfPosChr['mid']))[:, np.newaxis]
        X = np.array(list(dfchr['mid']))[:, np.newaxis]
        # estimate the density every 1000 bp
        X_plot = np.linspace(0, chrlen, int(chrlen / 1000))[:, np.newaxis]
        b = np.array([[0, 0]])
        print("optimization for", chrname)
        for bd in bdlist:
            kde = KernelDensity(kernel='gaussian', bandwidth=bd).fit(X)
            a = np.c_[bd, kde.score(Xp)]
            b = np.r_[b, a]
        sc[:] = sc[:] + b[:, 1]
    end = np.c_[bdlist, list(sc[1:, ])]
    idxrow = np.argwhere(end == max(end[:, 1]))[0, 0]
    newbd = int(end[idxrow, 0])
    print("the bandwidth is", newbd)
    # plt.plot(bdlist, list(sc[1:, ]))
    # plt.title("genome")
    # plt.xlabel("bandwidth (bp)")
    # plt.ylabel("log score of positive set")
    # plt.savefig(path + 'gene_density_optimization.png')
    # plt.close()
    dfout = pd.DataFrame({'A': bdlist, 'B': sc[1:, ]})
    dfout.to_csv(path_or_buf=outpath + "bandwidth_trials.txt", sep='\t',
                 header=False, index=False)
    return newbd
    sim_2niso_V = sim_2niso_V.reshape((LX, LX))

    norm_2niso = np.sqrt((sim_2niso_U**2 + sim_2niso_V**2) / float(c2))
    norm_2iso = np.sqrt((sim_2iso_U**2 + sim_2iso_V**2) / float(c2))
    norm_2iso_histo = norm_2iso.reshape([LXA])
    norm_2niso_histo = norm_2niso.reshape([LXA])

    u_bins_iso_E6[G] = np.linspace(np.log(np.amin(norm_2iso_histo)),
                                   np.log(np.amax(norm_2iso_histo)), 2**12)
    u_bins_niso_E6[G] = np.linspace(np.log(np.amin(norm_2niso_histo)),
                                    np.log(np.amax(norm_2niso_histo)), 2**12)

    kde_iso = KernelDensity(bandwidth=0.25, kernel='gaussian')
    kde_iso.fit(np.log(norm_2iso_histo)[:, None])
    logprob_iso_E6[G] = kde_iso.score_samples(u_bins_iso_E6[G][:, None])

    kde_niso = KernelDensity(bandwidth=0.25, kernel='gaussian')
    kde_niso.fit(np.log(norm_2niso_histo)[:, None])
    logprob_niso_E6[G] = kde_niso.score_samples(u_bins_niso_E6[G][:, None])

MDH.PushData(data=u_bins_iso_E6, key='u_bins_iso' + 'E6')
MDH.PushData(data=u_bins_niso_E6, key='u_bins_niso' + 'E6')
MDH.PushData(data=logprob_iso_E6, key='logprob_iso' + 'E6')
MDH.PushData(data=logprob_niso_E6, key='logprob_niso' + 'E6')

for G in [-1.75, -3.6]:
    print("G: ", G)
    sim_2iso_u = UFields['G' + str(G) + 'LX' + str(LX) + 'E8P4']
lastpdfhf, lastpdfcv = None, None
for ii in [np.where(x == TEST)[0][0]]:
    thesex, thesey, vesey = [], [], []
    for jj in range(epsh.shape[1]):
        ys.append(epsh[ii, jj])
        y2s.append(epscv[ii, jj])
        xs.append(x[ii])
        thesey.append(y2s[-1])
        vesey.append(ys[-1])

    np.save('thesey%i' % BS, thesey)
    thesey = np.array(thesey)
    vesey = np.array(vesey)

    xx = np.linspace(np.min(vesey), np.max(vesey), 100)
    # xx = np.linspace(min(vesey), max(vesey), 100)
    print('---->', min(thesey), max(thesey))
    kde2 = KernelDensity(kernel='gaussian', bandwidth=.05)
    kde2.fit(vesey.reshape(-1, 1))
    pdff = np.exp(kde2.score_samples(xx.reshape(-1, 1)))
    if lastpdfcv is not None:
        cvDs.append(entropy(pdff, lastpdfcv))
    lastpdfcv = pdff.copy()
    ax[0].plot(xx, pdff / simps(pdff, xx), label=BS, c=colors[ll])

    # xx = np.linspace(0, 2 * max(thesey), 1000)
    xx = np.linspace(min(thesey), max(thesey), 100)
    kde = KernelDensity(kernel='gaussian', bandwidth=.01)
    kde.fit(thesey.reshape(-1, 1))
    pdff = np.exp(kde.score_samples(xx.reshape(-1, 1)))
    if lastpdfhf is not None:
        hfDs.append(entropy(pdff, lastpdfhf))
    lastpdfhf = pdff.copy()
    ax[1].plot(xx, pdff / simps(pdff, xx), label=BS, c=colors[ll])

    ax[0].hist(vesey, 200, alpha=0.3, density=True)
    ax[1].hist(thesey, 200, alpha=0.3, density=True)
def extract_profiles_union(global_data, target_ind_dict, threshold, P):
    # estimate the bandwidth
    params = {'bandwidth': np.linspace(np.min(global_data),
                                       np.max(global_data), 20)}
    grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                      breadth_first=False),
                        params, verbose=0)

    # fit one KDE per target population
    combine = {}
    for bull in target_ind_dict.keys():
        grid.fit(global_data[target_ind_dict[bull], :])
        combine[bull] = grid.best_estimator_

    Stats = recursively_default_dict()

    for combo in it.combinations(target_ind_dict.keys(), 2):
        pop1 = combo[0]
        pop2 = combo[1]
        All_coords = [x for x in it.chain(*[target_ind_dict[z] for z in combo])]
        Quanted_set = global_data[All_coords, :]

        i_coords, j_coords, z_coords = np.meshgrid(
            np.linspace(min(Quanted_set[:, 0]), max(Quanted_set[:, 0]), P),
            np.linspace(min(Quanted_set[:, 1]), max(Quanted_set[:, 1]), P),
            np.linspace(min(Quanted_set[:, 2]), max(Quanted_set[:, 2]), P),
            indexing='ij')

        traces = [x for x in it.product(range(P), range(P), range(P))]
        background = np.array([i_coords, j_coords, z_coords])
        background = [background[:, c[0], c[1], c[2]] for c in traces]
        background = np.array(background)

        pop1_fist = combine[pop1].score_samples(background)
        # pop1_fist = np.exp(pop1_fist)
        P_dist_pop1 = combine[pop1].score_samples(
            global_data[target_ind_dict[pop1], :])
        pop1_fist = scipy.stats.norm(np.mean(P_dist_pop1),
                                     np.std(P_dist_pop1)).cdf(pop1_fist)
        pop1_fist = [int(x >= threshold) for x in pop1_fist]

        pop2_fist = combine[pop2].score_samples(background)
        # pop2_fist = np.exp(pop2_fist)
        P_dist_pop2 = combine[pop2].score_samples(
            global_data[target_ind_dict[pop2], :])
        pop2_fist = scipy.stats.norm(np.mean(P_dist_pop2),
                                     np.std(P_dist_pop2)).cdf(pop2_fist)
        pop2_fist = [int(x >= threshold) for x in pop2_fist]

        pop1_and_2 = len([x for x in range(background.shape[0])
                          if pop1_fist[x] == 1 and pop2_fist[x] == 1])
        pop1_I_pop2 = pop1_and_2 / float(sum(pop1_fist))
        pop2_I_pop1 = pop1_and_2 / float(sum(pop2_fist))

        total_overlap = pop1_and_2 / float(sum(pop1_fist) + sum(pop2_fist)
                                           - pop1_and_2)
        empty_space = 1 - (sum(pop1_fist) + sum(pop2_fist)
                           - pop1_and_2) / background.shape[0]

        Stats[combo][pop1] = pop1_I_pop2
        Stats[combo][pop2] = pop2_I_pop1
        Stats[combo]['empty'] = empty_space
        Stats[combo]['PU'] = total_overlap

    return Stats
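# A minimal sketch of the recursively_default_dict() helper assumed above
# (a hypothetical reconstruction): a defaultdict whose missing keys produce
# further defaultdicts, so Stats[combo][pop1] = ... works without setup.
import collections

def recursively_default_dict():
    return collections.defaultdict(recursively_default_dict)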
def calc_pdist(df, columns=None, mode="kde", bandwidth=None, grid=None, **kwargs):
    """
    Calculates a probability distribution over a DataFrame.

    Arguments:
      df (DataFrame): DataFrame over which to calculate the probability
        distribution of each column over rows
      columns (list): Columns for which to calculate the probability
        distribution
      mode (ndarray, str, optional): Method of calculating the probability
        distribution; eventually will support 'hist' for histogram and 'kde'
        for kernel density estimate, though presently only 'kde' is
        implemented
      bandwidth (float, dict, str, optional): Bandwidth to use for kernel
        density estimates; may be a single float that will be applied to all
        columns or a dictionary whose keys are column names and values are
        floats corresponding to the bandwidth for each column; for any column
        for which *bandwidth* is not specified, the standard deviation will
        be used
      grid (list, ndarray, dict, optional): Grid on which to calculate the
        kernel density estimate; may be a single ndarray that will be applied
        to all columns or a dictionary whose keys are column names and values
        are ndarrays corresponding to the grid for each column; for any
        column for which *grid* is not specified, a grid of 1000 points
        between the minimum value minus three times the standard deviation
        and the maximum value plus three times the standard deviation will
        be used
      kde_kw (dict, optional): Keyword arguments passed to
        :class:`sklearn.neighbors.KernelDensity`
      verbose (int): Level of verbose output
      kwargs (dict): Additional keyword arguments

    Returns:
      DataFrame: DataFrame whose index is the shared *grid* and whose columns
      match those of *df*, containing the normalized probability of each
      column at each grid point

    .. todo:
      - Support the per-column bandwidths and grids described above
    """
    from sklearn.neighbors import KernelDensity

    # Process arguments
    verbose = kwargs.get("verbose", 1)
    if verbose >= 1:
        wiprint("""Calculating probability distribution over DataFrame""")

    if mode == "kde":
        # Prepare bandwidths
        if bandwidth is None:
            bandwidth = df.values.std()

        # Prepare grids
        if grid is None:
            grid = np.linspace(df.values.min() - 3 * bandwidth,
                               df.values.max() + 3 * bandwidth, 1000)
        elif isinstance(grid, list):
            grid = np.array(grid)

        # Calculate probability distributions
        kde_kw = kwargs.get("kde_kw", {})
        pdist = np.zeros((grid.size, df.columns.size))
        for i, column in enumerate(df.columns.values):
            series = df[column]
            if verbose >= 1:
                wiprint("calculating probability distribution of "
                        "{0} using a kernel density estimate".format(column))
            kde = KernelDensity(bandwidth=bandwidth, **kde_kw)
            kde.fit(series.dropna().values[:, np.newaxis])
            pdf = np.exp(kde.score_samples(grid[:, np.newaxis]))
            pdf /= pdf.sum()
            pdist[:, i] = pdf
        pdist = pd.DataFrame(pdist, index=grid, columns=df.columns)
    else:
        raise Exception(sformat("""only kernel density estimation is
                                currently supported"""))

    return pdist
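# A short usage sketch for calc_pdist() (the column names and bandwidth below
# are illustrative assumptions; verbose=0 avoids the module's wiprint()
# logging helper):
import numpy as np
import pandas as pd

example_df = pd.DataFrame({"phi": np.random.randn(500),
                           "psi": np.random.randn(500) + 2})
pdist = calc_pdist(example_df, bandwidth=0.25, verbose=0)
print(pdist.head())  # one probability column per input column, indexed by grid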
def DBRE_analyzer(filename):
    global df, reset_time, max_time, num_measurements, min_plateau_len, printplots, index

    try:  # try to read the text file
        raw_data = pd.read_csv(filename + '.DTA', sep='\t', header=None,
                               usecols=[2, 3], skiprows=64,
                               names=['Time', 'Voltage'])
    except Exception:  # if the file cannot be read yet, wait reset_time
        time.sleep(reset_time)
        return DBRE_analyzer(filename)

    # check again whether the file is empty, and if so, wait reset_time before retrying
    if raw_data.empty:
        time.sleep(reset_time)
        return DBRE_analyzer(filename)

    # extract date, time and charging time, then convert to hours elapsed
    experimentnumber = filename[index:]
    f = open(filename + '.DTA', 'r')
    lines = f.readlines()
    datestamp = lines[3].split('\t')[2]
    timestamp = lines[4].split('\t')[2]
    datetimestamp = datetime.strptime(datestamp + ' ' + timestamp,
                                      '%m/%d/%Y %H:%M:%S')
    dt = datetimestamp - start_time
    hours = dt.total_seconds() / 3600
    charging_time = float(lines[11].split('\t')[2])
    f.close()

    # export the raw discharge curve to an Excel datafile
    raw_data.to_excel(filename + '.xlsx')

    # filter out times past the maximum time
    raw_data = raw_data[raw_data.Time <= max_time]

    # if the datafile is non-physical, skip to the next one
    if any(abs(raw_data.Voltage) > voltage_lims[2]):
        new_number = int(experimentnumber) + 1
        if new_number > num_measurements:
            return 'Done'
        new_filename = filename[:index]
        new_filename = new_filename + str(new_number)
        return DBRE_analyzer(new_filename)  # recursive loop until all files are parsed

    # produce the discharge plot
    if printplots:
        plt.figure()
        plt.suptitle('Discharge for run #' + experimentnumber)
        # VOLTAGE PLOT
        top = plt.subplot(2, 1, 1)
        plt.plot(raw_data.Time, raw_data.Voltage)
        plt.axis([-10, max_time, min(raw_data.Voltage),
                  raw_data['Voltage'].iloc[-1] + 0.05])
        plt.xlabel('Time (s)')
        plt.ylabel('Voltage (V)')

    # filter out the charging step
    raw_data = raw_data[raw_data.Time > charging_time]
    raw_data = raw_data.reset_index()

    # stop if the datafile is incomplete
    if raw_data.empty:
        return 'Done'

    # use KDE to find the plateau
    voltage_data = np.array(raw_data.Voltage)
    voltage_data = voltage_data.reshape(-1, 1)
    X_plot = np.linspace(np.amin(voltage_data), np.amax(voltage_data),
                         1000)[:, np.newaxis]
    kde = KernelDensity(bandwidth=0.01).fit(voltage_data)
    log_dens = kde.score_samples(X_plot)
    mi, ma = argrelextrema(log_dens, np.less)[0], argrelextrema(log_dens, np.greater)[0]
    if len(mi) > 0:
        i = 0
        plateau = voltage_data[voltage_data < X_plot[mi[i]]]
        while len(plateau) < min_plateau_len and i + 1 < len(mi):
            plateau = voltage_data[np.logical_and(voltage_data < X_plot[mi[i + 1]],
                                                  voltage_data > X_plot[mi[i]])]
            i += 1
        if len(plateau) < min_plateau_len and i + 1 == len(mi):
            plateau = voltage_data[voltage_data > X_plot[mi[i]]]
    else:
        plateau = voltage_data
    weights = wts(plateau)
    voltage = -np.average(plateau, weights=weights)
    uncertainty = tstd(plateau)

    # if the result is non-physical, skip to the next datafile
    if voltage > voltage_lims[1] or voltage < voltage_lims[0]:
        new_number = int(experimentnumber) + 1
        if new_number > num_measurements:
            return 'Done'
        new_filename = filename[:index]
        new_filename = new_filename + str(new_number)
        return DBRE_analyzer(new_filename)  # recursive loop until all files are parsed

    # plot the KDE curve
    if printplots:
        bottom = plt.subplot(2, 1, 2)
        plt.plot(X_plot[:, 0], np.exp(log_dens), color='darkviolet', lw=2,
                 linestyle='-')
        plt.plot(voltage_data[:, 0],
                 -0.005 - 0.01 * np.random.random(voltage_data.shape[0]), '+k')
        plt.xlabel('Voltage (V)')
        plt.ylabel('Probability Density')
        plt.axis([np.amin(voltage_data), np.amax(voltage_data), -0.02, 2])
        # plt.axis([np.amin(voltage_data), np.amax(voltage_data), -0.02,
        #           np.exp(np.amax(log_dens)) + 0.02])

    # add the plateau line to the discharge plot
    if printplots:
        top.plot([-10, max_time], [-voltage, -voltage], '--k')
        # save the plot
        plt.savefig(filename + '.png', dpi=300)
        plt.close()

    # add info to the overall Excel file, DBRE_Summary.xlsx
    df = df.append({'Hours': hours,
                    'Date': datestamp,
                    'Time': timestamp,
                    'Potential': voltage,
                    'Uncertainty': uncertainty},
                   ignore_index=True)  # add values to the overall dataframe
    df.to_excel('DBRE_Summary.xlsx')

    # plot the salt potential over time after each trial is done
    plt.figure()
    plt.suptitle('Salt Potential Over Time')
    plt.errorbar(df.Hours, df.Potential, yerr=df.Uncertainty, color='blue',
                 ecolor='black', fmt='o', capsize=5)
    plt.xlabel('Time (hr)')
    plt.ylabel('Salt Potential (V vs Be|Be2+)')
    plt.ticklabel_format(axis='x', style='plain', useOffset=False)
    plt.savefig('DBRE_Summary.png', dpi=300)
    plt.close()

    # prepare to either read the next file or stop
    new_number = int(experimentnumber) + 1
    if new_number > num_measurements:
        return 'Done'
    new_filename = filename[:index]
    new_filename = new_filename + str(new_number)
    return DBRE_analyzer(new_filename)  # recursive loop until all files are parsed
# In[18]:

# compute the SPE statistic
X_pca_SPE = Series(np.sum((X - X_pca_recover)**2, axis=1), index=X.index)

# #### Determine the threshold from a confidence level
# ##### Option 1: estimate the probability density with scikit-learn's KDE API

# In[19]:

from sklearn.neighbors import KernelDensity

# In[20]:

# reshape(-1, 1) is required by the API; otherwise the data are treated as a
# single point and density estimation is meaningless
X_pca_T2_scikit_kde = KernelDensity().fit(X_pca_T2.values.reshape(-1, 1))
X_pca_SPE_scikit_kde = KernelDensity().fit(X_pca_SPE.values.reshape(-1, 1))

# In[21]:

X_pca_T2_sort = X_pca_T2.sort_values()
plt.plot(np.exp(X_pca_T2_scikit_kde.score_samples(
    X_pca_T2_sort.values.reshape(-1, 1))))

# In[22]:

X_pca_T2_dens_plot = np.linspace(0, 50, 1000)
plt.plot(np.exp(X_pca_T2_scikit_kde.score_samples(
    X_pca_T2_dens_plot.reshape(-1, 1))))
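# A hedged sketch of how a confidence threshold could be read off the fitted
# KDE (the 99% level, the grid integration, and the variable names below are
# illustrative assumptions, not part of the original notebook): integrate the
# density to an empirical CDF and take the grid point where it reaches 0.99.
dens = np.exp(X_pca_T2_scikit_kde.score_samples(X_pca_T2_dens_plot.reshape(-1, 1)))
cdf = np.cumsum(dens) * (X_pca_T2_dens_plot[1] - X_pca_T2_dens_plot[0])
idx = min(int(np.searchsorted(cdf, 0.99)), len(X_pca_T2_dens_plot) - 1)
T2_threshold = X_pca_T2_dens_plot[idx]
print(T2_threshold)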