def reps_necesarias(matr, eje_parám, eje_estoc, frac_incert, confianza):
    n_parám = matr.shape[eje_parám]
    n_estoc = matr.shape[eje_estoc]
    otras_dims = [e for i, e in enumerate(matr.shape) if i != eje_estoc]
    n_iter = 100  # ~200 might work better

    # Width of the uncertainty interval as a function of the number of
    # stochastic repetitions
    matr_perc_estoc = np.zeros((*otras_dims, n_estoc - 1))
    for i in range(2, n_estoc + 1):
        rango = np.zeros((n_iter, *otras_dims))
        for j in range(n_iter):
            reps_e = np.random.choice(n_estoc, i, replace=False)
            matr_sel = np.take(matr, reps_e, axis=eje_estoc)
            prcntl = np.quantile(
                matr_sel,
                q=[(1 - frac_incert) / 2, 0.5 + frac_incert / 2],
                axis=eje_estoc)
            rango[j] = np.ptp(prcntl, axis=0)
        matr_perc_estoc[..., i - 2] = np.mean(rango, axis=0)

    # Fit interval width against 1/n and extrapolate the required n
    x = 1 / np.arange(2, n_estoc + 1)
    a_0, b = _reg_lin(x, matr_perc_estoc, eje=-1)
    a = -1 / a_0
    req_n_estoc = np.ceil(np.nanmax(1 / (a * b * (1 - confianza))))
    if np.isnan(req_n_estoc):
        req_n_estoc = 1

    # Same procedure for the parametric axis
    otras_dims = [e for i, e in enumerate(matr.shape)
                  if i != eje_parám and i != eje_estoc]
    matr_perc_prm = np.zeros((*otras_dims, n_parám - 1))
    rango = np.zeros((n_iter, *otras_dims))
    for i in range(2, n_parám + 1):
        for j in range(n_iter):
            reps_e = np.random.choice(n_parám, i, replace=False)
            matr_sel = np.take(matr, reps_e, axis=eje_parám)
            prcntl = np.quantile(
                matr_sel,
                q=[(1 - frac_incert) / 2, 0.5 + frac_incert / 2],
                axis=(eje_parám, eje_estoc))
            rango[j] = np.ptp(prcntl, axis=0)
        matr_perc_prm[..., i - 2] = np.mean(rango, axis=0)

    x = 1 / np.arange(2, n_parám + 1)
    a_0, b = _reg_lin(x, matr_perc_prm, eje=-1)
    req_n_prm = np.ceil(np.nanmax(-a_0 / (b * (1 - confianza))))
    if np.isnan(req_n_prm):
        req_n_prm = 1

    return {'estoc': req_n_estoc, 'parám': req_n_prm}
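# `_reg_lin` is not defined in this snippet. A minimal sketch of what it is
# assumed to do (ordinary least squares of y against x along axis `eje`,
# vectorised over the remaining axes, returning slope and intercept) follows;
# the name and signature are taken from the call sites above, the body is an
# assumption:
import numpy as np

def _reg_lin(x, y, eje=-1):
    # slope a and intercept b of y ~ a*x + b, fitted along axis `eje`
    x_mean = np.mean(x)
    y_mean = np.mean(y, axis=eje, keepdims=True)
    a = (np.mean((x - x_mean) * (y - y_mean), axis=eje)
         / np.mean((x - x_mean) ** 2))
    b = np.squeeze(y_mean, axis=eje) - a * x_mean
    return a, b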
def quantile(data_lst, weights=None, q=0.5):
    """
    Return a specific quantile.

    Args:
        data_lst (list or np.ndarray): 1D data list to be used for computing
            quantiles
        weights: Accepted for API compatibility; currently ignored.
        q (float): The quantile, as a fraction between 0 and 1.

    Returns:
        (float) The computed quantile of the data_lst.
    """
    q = float(q)
    return np.quantile(data_lst, q=q)
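# The `weights` argument above is accepted but ignored. If weighted quantiles
# were ever needed, a common (hypothetical, not part of the original code)
# approach interpolates the weighted empirical CDF:
import numpy as np

def weighted_quantile(data_lst, weights, q=0.5):
    data = np.asarray(data_lst, dtype=float)
    w = np.asarray(weights, dtype=float)
    order = np.argsort(data)
    data, w = data[order], w[order]
    cdf = np.cumsum(w) - 0.5 * w  # midpoint rule
    cdf /= np.sum(w)
    return float(np.interp(q, cdf, data))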
def bihist_eq(frame, params):
    """Bi-histogram equalization based on Kim (1997); returns an equalized
    version of the input 8-bit frame."""
    # global image histogram
    bins = np.arange(257)
    vals, bins = np.histogram(frame.ravel(), bins=bins)
    sp = np.quantile(frame, params['eq_sp'])  # separation point
    rp = 255 * params['eq_rp']  # range point

    # upper histogram: intensities above the separation point
    upper = vals[bins[1:] > sp]
    u_pdf = upper / (np.sum(upper) + np.finfo(float).eps)
    u_cdf = np.cumsum(u_pdf)
    upper_eq = rp + (255 - rp) * (u_cdf - 0.5 * u_pdf)

    # lower histogram: intensities at or below the separation point
    lower = vals[bins[1:] <= sp]
    l_pdf = lower / (np.sum(lower) + np.finfo(float).eps)
    l_cdf = np.cumsum(l_pdf)
    lower_eq = rp * (l_cdf - 0.5 * l_pdf)

    # intensity value look-up table
    eq_lut = np.concatenate((lower_eq, upper_eq))
    # equalized values
    eq_vals = [eq_lut[i] for i in frame.ravel()]
    # equalized frame
    eq_frame = np.reshape(eq_vals, frame.shape)
    return np.round(eq_frame).astype('uint8')
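# A minimal usage sketch for `bihist_eq`, assuming an 8-bit grayscale frame
# and the two parameter keys used above: 'eq_sp' (separation quantile) and
# 'eq_rp' (range point as a fraction of the 0-255 scale).
import numpy as np

frame = np.random.randint(0, 256, size=(120, 160), dtype=np.uint8)
params = {'eq_sp': 0.5, 'eq_rp': 0.5}  # split at the median, mid-range remap
equalized = bihist_eq(frame, params)
assert equalized.shape == frame.shape and equalized.dtype == np.uint8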
# model.compile(loss=HuberLoss(), optimizer =

# %%
# trying MC DROPOUT
# force training mode = dropout on
with keras.backend.learning_phase_scope(1):
    y_probas = np.stack([model.predict(X_test) for sample in range(100)])
y_proba = y_probas.mean(axis=0)

X_test.shape
y_test
y_proba.T
y_proba.shape
y_proba.mean()
y_proba.std()

# credible interval across the 100 MC-dropout samples (axis 0 of y_probas,
# not axis 1 of the already-averaged y_proba)
ci = 0.95
lower_lim = np.quantile(y_probas, 0.5 - ci / 2, axis=0)
upper_lim = np.quantile(y_probas, 0.5 + ci / 2, axis=0)
lower_lim
upper_lim
lower_lim == upper_lim
def q_99(arr: Union[pd.Series, np.ndarray]) -> float:
    arr = np.asarray(arr)
    return np.quantile(arr, 0.99)
def train(self, target_model, X_mal_train, X_mal_test, X_good_train,
          X_good_test, mal_label=1, good_label=0, earlystop=False,
          zmin=0, zmax=1, epochs=500, batch_size=32,
          combined_d_batch=False, d_train_mal=False, d_train_adv=True,
          good_batch_factor=1, d_times=1, gan_times=1, n_progress=1,
          minTPR_threshold=0, max_changes=np.inf, gan_dir=GAN_DIR,
          smooth_alpha=1.0, sample_train=True):
    """
    Performs GAN training.

    :param target_model: The target model of the evasion attack
    :param X_mal_train: The malware training set
    :param X_mal_test: The malware test set
    :param X_good_train: The goodware training set
    :param X_good_test: The goodware test set
    :param mal_label: The label for the malware class (original label)
    :param good_label: The label for the goodware class (target label)
    :param earlystop: Whether to stop training once the target evasion rate
        is achieved within max_changes
    :param zmin: The lower bound of the random noise
    :param zmax: The upper bound of the random noise
    :param epochs: The number of training epochs
    :param batch_size: The size of a training batch
    :param combined_d_batch: Whether to train the discriminator on one batch
        that combines all classes, or on each class separately
    :param d_train_mal: Whether to train the discriminator on malware
    :param d_train_adv: Whether to train the discriminator on adversarial
        examples
    :param good_batch_factor: The size ratio of a goodware batch compared to
        that of a malware batch
    :param d_times: The number of times to train the discriminator in each
        iteration
    :param gan_times: The number of times to train the GAN in each iteration
    :param n_progress: The number of epochs with no improvement/output after
        which to print output to check for progress
    :param minTPR_threshold: The threshold to which we wish to minimise the
        True Positive Rate (TPR)
    :param max_changes: A constraint on the maximum number of changes in
        generated adversarial examples (AEs)
    :param gan_dir: The output directory for saved weights and models
    :param smooth_alpha: The label-smoothing factor for malware labels
    :param sample_train: Whether to sample plotted AEs from the training set
        (otherwise from the test set)
    :return: tuple (
        TPR_train: The list of TPR scores on the training set at each epoch,
        TPR_test: The list of TPR scores on the test set at each epoch,
        avg_diff_train: The list of avg changes in AEs generated from the
            training set at each epoch,
        avg_diff_test: The list of avg changes in AEs generated from the
            test set at each epoch,
        d_metrics: The list of the discriminator metrics [loss, accuracy]
            at each epoch,
        gan_metrics: The list of the GAN metrics [loss, accuracy] at each
            epoch,
        best_G_path: The path to the best performing G model
        )
    """
    g_batch_size = good_batch_factor * batch_size

    # Metrics accumulators
    d_metrics = []
    gan_metrics = []

    # Initial TPR on the training & test sets
    TPR_train = [target_model.score(X_mal_train,
                                    mal_label * ones(X_mal_train.shape[0]))]
    TPR_test = [target_model.score(X_mal_test,
                                   mal_label * ones(X_mal_test.shape[0]))]
    minTPR = 1.0
    minTPR_avg_changes = -1
    minTPR_max_changes = -1
    min_epoch = output_epoch = 0
    best_G_path = None
    print(f"Initial TPR on the training set: {TPR_train}")
    print(f"Initial TPR on the test set: {TPR_test}\n")

    # Average changes (perturbations) in adversarial examples
    avg_diff_train = []
    avg_diff_test = []

    # IDs for plots
    plot_id = 1
    gan_id = 1
    tpr_id = 1

    t1 = time.perf_counter()
    for epoch in range(epochs):
        # Generate batches of size (gan_times * batch_size)
        X_mal_batches = batch(X_mal_train, gan_times * batch_size, seed=epoch)

        # Epoch metrics accumulators
        d_metrics_epoch = np.empty((0, 2))
        gan_metrics_epoch = np.empty((0, 2))

        for X_mal_batch in X_mal_batches:
            ################################################################
            # Train the discriminator for d_times iterations
            ################################################################
            # Generate minibatches of size batch_size
            minibatches = batch(X_mal_batch, batch_size, seed=epoch)
            d_metrics_batch = np.empty((0, 2))

            # Train for d_times
            for i in range(d_times):
                # Could reseed with (epoch + i) for reproducibility
                X_mal = next(minibatches, None)  # Use these batches first
                if X_mal is None:  # Then generate randomly
                    X_mal = rand_batch(X_mal_train, batch_size)
                Y_mal = smooth_alpha * mal_label * ones(X_mal.shape[0])  # Smooth

                noise = np.random.uniform(zmin, zmax,
                                          size=[batch_size, self.z_dim])
                # Generate adversarial examples
                X_adv = self.generator.predict([X_mal, noise])
                X_adv = binarise(X_adv, self.bin_threshold)
                Y_adv = target_model.predict(X_adv)
                Y_adv[Y_adv == mal_label] = smooth_alpha * mal_label  # Smooth

                X_good = rand_batch(X_good_train, g_batch_size)
                Y_good = good_label * ones(X_good.shape[0])  # Good_Label

                # Train the discriminator
                self.discriminator.trainable = True
                if combined_d_batch:
                    # *** Train once on a combined batch ***
                    X = X_good
                    Y = Y_good
                    if d_train_mal:
                        X = np.concatenate((X, X_mal))
                        Y = np.concatenate((Y, Y_mal))
                    if d_train_adv:
                        X = np.concatenate((X, X_adv))
                        Y = np.concatenate((Y, Y_adv))
                    metrics = self.discriminator.train_on_batch(X, Y)
                else:
                    # ** Train on separate batches & combine metrics **
                    metrics_good = self.discriminator.train_on_batch(X_good,
                                                                     Y_good)
                    metrics_mal = self.discriminator.train_on_batch(
                        X_mal, Y_mal) if d_train_mal else [np.nan, np.nan]
                    metrics_adv = self.discriminator.train_on_batch(
                        X_adv, Y_adv) if d_train_adv else [np.nan, np.nan]
                    # Avg metrics
                    metrics = np.nanmean(
                        np.array([metrics_mal, metrics_good, metrics_adv]),
                        axis=0)

                # Accumulate metrics for d_times iterations
                d_metrics_batch = np.vstack((d_metrics_batch, metrics))

            # Average the metrics of all d_times iterations
            d_metrics_batch = np.mean(d_metrics_batch, axis=0)
            # Add to discriminator metrics for this epoch (note: the batch
            # average, not the last iteration's metrics)
            d_metrics_epoch = np.vstack((d_metrics_epoch, d_metrics_batch))

            ################################################################
            # Train the Generator
            ################################################################
            # Generate minibatches of size batch_size
            minibatches = batch(X_mal_batch, batch_size, seed=epoch)
            gan_metrics_batch = np.empty((0, 2))

            # Train for gan_times
            for i in range(gan_times):
                # Number of minibatches should be exactly gan_times
                X_mal = next(minibatches, None)
                if X_mal is None:  # Just in case, generate randomly
                    X_mal = rand_batch(X_mal_train, batch_size)
                noise = np.random.uniform(zmin, zmax,
                                          size=[batch_size, self.z_dim])
                self.discriminator.trainable = False
                # Train with target label = GOOD_LABEL
                metrics = self.GAN.train_on_batch(
                    [X_mal, noise], good_label * ones(X_mal.shape[0]))
                # discriminator.trainable = True

                # Accumulate metrics for gan_times iterations
                gan_metrics_batch = np.vstack((gan_metrics_batch, metrics))

            # Average the metrics of all gan_times iterations
            gan_metrics_batch = np.mean(gan_metrics_batch, axis=0)
            # Add to the generator metrics for this epoch (batch average)
            gan_metrics_epoch = np.vstack((gan_metrics_epoch,
                                           gan_metrics_batch))

        # Average metrics of each epoch
        d_metrics.append(np.mean(d_metrics_epoch, axis=0).tolist())
        gan_metrics.append(np.mean(gan_metrics_epoch, axis=0).tolist())
        gan_loss = gan_metrics[-1][0]

        # TPR on adversarial training set
        noise = np.random.uniform(zmin, zmax,
                                  (X_mal_train.shape[0], self.z_dim))
        X_adv_train = binarise(self.generator.predict([X_mal_train, noise]),
                               self.bin_threshold)
        # Score with target label = MAL_LABEL
        Y_adv_train = mal_label * ones(X_adv_train.shape[0])  # MAL_LABEL
        TPR = target_model.score(X_adv_train, Y_adv_train)
        TPR_train.append(TPR)

        # Changes (L1 norms) in the adversarial training set
        diff_train = norm((X_adv_train - X_mal_train), ord=1, axis=1)
        avg_diff_train_current = np.mean(diff_train)
        max_diff_train_current = np.max(diff_train)
        avg_diff_train.append(avg_diff_train_current)

        # TPR on adversarial test set
        noise = np.random.uniform(zmin, zmax,
                                  (X_mal_test.shape[0], self.z_dim))
        X_adv_test = binarise(self.generator.predict([X_mal_test, noise]),
                              self.bin_threshold)
        Y_adv_test = mal_label * ones(X_adv_test.shape[0])  # MAL_LABEL
        TPR = target_model.score(X_adv_test, Y_adv_test)
        TPR_test.append(TPR)

        # Changes (L1 norms) in the adversarial test set
        diff_test = norm((X_adv_test - X_mal_test), ord=1, axis=1)
        avg_diff_test_current = np.mean(diff_test)
        max_diff_test_current = np.max(diff_test)
        avg_diff_test.append(avg_diff_test_current)

        # Output progress if TPR has decreased (improved evasion)
        # ... or if TPR is the same but avg changes have decreased
        if (TPR < minTPR) or \
                (TPR == minTPR and
                 avg_diff_test_current < minTPR_avg_changes):  # check avg or max
            print("\n>>>> New Best Results: "
                  f"Previous minTPR: [{minTPR:.8f}] ==> "
                  f"New minTPR: [{TPR:0.8f}] "
                  f"GAN Loss: [{gan_loss:.8f}] <<<<")
            output_progress(epoch, TPR_train, TPR_test, diff_train, diff_test)
            minTPR = TPR
            min_epoch = output_epoch = epoch
            minTPR_avg_changes = avg_diff_test_current
            minTPR_max_changes = max_diff_test_current
            minTPR_std = np.std(diff_test)
            minTPR_quantiles = np.quantile(diff_test, [0.25, 0.5, 0.75])

            # Save weights
            minTPR_weights_path = \
                (gan_dir + self.save_dir + 'weights/' +
                 f'GAN_minTPR_weights_epoch_{epoch}_'
                 f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                 f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_' +
                 time.strftime("%m-%d_%H-%M-%S") + '.h5')
            self.GAN.save_weights(minTPR_weights_path)

            # Generate and plot a sample of AEs
            sample_sz = 10
            sample_noise = np.random.uniform(zmin, zmax,
                                             size=[sample_sz, self.z_dim])
            if sample_train:  # Sample from training set
                sample_mal = rand_batch(X_mal_batch, sample_sz)
            else:  # Sample from test set
                sample_mal = np.asarray(rand_batch(X_mal_test, sample_sz))
            plot_sample(sample_mal, sample_noise, self.generator,
                        target_model, epoch, TPR_train=TPR_train,
                        TPR_test=TPR_test, params=self.log_params,
                        avg_changes=avg_diff_test_current,
                        m_label=mal_label, g_label=good_label,
                        annotate=False, out_dir=ADV_DIR, plot_id=plot_id)
            plot_id = plot_id + 1

            if minTPR <= minTPR_threshold:
                print(
                    "\n" + "#" * 150 + "\n"
                    f"# Target Evasion Rate {100 * (1 - TPR):.2f}% "
                    f"achieved at epoch [{epoch}], "
                    f"with avg {avg_diff_test_current:.1f} "
                    f"& max {max_diff_test_current:.1f} changes per sample "
                    f"(on the test set) ... "
                    f"GAN Loss: [{gan_loss:.8f}]"
                    "\n" + "#" * 150 + "\n"
                )
                if minTPR_avg_changes <= max_changes:
                    print("Training CONVERGED. "
                          "Target Evasion Rate achieved within max changes..."
                          "TRAINING ENDS HERE #")
                    # Save generator
                    best_G_path = \
                        (gan_dir + self.save_dir + 'models/' +
                         f'G_Target_TPR_epoch_{epoch}_'
                         f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                         f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_' +
                         time.strftime("%m-%d_%H-%M-%S") + '.h5')
                    self.generator.save(best_G_path)
                    if earlystop:
                        break

        # If no better than minTPR, but still achieved target evasion, ...
        elif TPR <= minTPR_threshold:
            # output_epoch = epoch
            print(
                "\n" + "#" * 150 + "\n"
                f"# Target Evasion Rate {100 * (1 - TPR):.2f}% "
                f"achieved at epoch [{epoch}] "
                f"with avg {avg_diff_test_current:.1f} "
                f"and max {max_diff_test_current:.1f} changes per sample "
                f"(on the test set) ... "
                f"GAN Loss: [{gan_loss:.8f}]"
                "\n" + "#" * 150 + "\n"
            )
            # Save weights
            weights_path = \
                (gan_dir + self.save_dir + 'weights/' +
                 f'GAN_minTPR_weights_epoch_{epoch}_'
                 f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                 f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_' +
                 time.strftime("%m-%d_%H-%M-%S") + '.h5')
            # self.GAN.save_weights(weights_path)

            # If within max changes
            if avg_diff_test_current <= max_changes:  # check avg or max?
                print("Target Evasion Rate achieved within max changes...")
                # Save model
                model_path = \
                    (gan_dir + self.save_dir + 'models/' +
                     f'GAN_Target_TPR_epoch_{epoch}_'
                     f'TPR_{minTPR:.2f}_dtimes_{d_times}_changes_'
                     f'{avg_diff_test_current:.0f}_actReg_{self.reg[0]}_' +
                     time.strftime("%m-%d_%H-%M-%S") + '.h5')
                # self.GAN.save(model_path)
                if earlystop:
                    break
        else:
            print()
            # Maybe adjust weights
            # print("Should we adjust regularizers?")
            # generator.layers[-2].rate *= 0.1
            # generator.layers[-3].activity_regularizer.l1 *= 0.1
            # generator.layers[-3].activity_regularizer.l2 *= 0.1
            # weights = generator.get_weights()
            # generator = keras.models.clone_model(generator)
            # generator.set_weights(weights)
            # Adapt regularisation weights
            # K.set_value(l1_factor, 0.1 * l1_factor)
            # K.set_value(l2_factor, 0.1 * l2_factor)

        if (epoch + 1 - output_epoch) > n_progress:
            # If no new improvement for a while, output progress
            output_epoch = epoch
            print(f"\n*** Checking progress *** "
                  f"GAN Loss: [{gan_loss:.8f}] ***")
            output_progress(epoch, TPR_train, TPR_test, diff_train, diff_test)
            # Generate and plot a sample of AEs
            sample_sz = 10
            sample_noise = np.random.uniform(zmin, zmax,
                                             size=[sample_sz, self.z_dim])
            sample_mal = rand_batch(X_mal_batch, sample_sz)
            plot_sample(sample_mal, sample_noise, self.generator,
                        target_model, epoch, TPR_train=TPR_train,
                        TPR_test=TPR_test, params=self.log_params,
                        avg_changes=avg_diff_test_current,
                        m_label=mal_label, g_label=good_label,
                        annotate=False, out_dir=ADV_DIR, plot_id=plot_id)
            plot_id = plot_id + 1

    t2 = time.perf_counter()
    print("\n\n" + "#" * 165 + "\n"
          f"# Finished {epoch + 1} epochs in {(t2 - t1) / 60:.2f} minutes\n"
          f"# Best Evasion Rate = {100 * (1 - minTPR):.4f}% "
          f"(lowest TPR = {100 * minTPR:.4f}%) "
          f"achieved after {min_epoch + 1} epochs, with avg "
          f"{minTPR_avg_changes:.1f} \u00b1 SD({minTPR_std:.1f}) | "
          f" Q1-3 {minTPR_quantiles.astype(int).tolist()} | "
          f" and max {minTPR_max_changes:.1f} "
          f"changes per sample.\n" + "#" * 165 + "\n\n")

    return TPR_train, TPR_test, \
        avg_diff_train, avg_diff_test, \
        d_metrics, gan_metrics, \
        best_G_path
data = data.drop(columns=['User_ID', 'Product_ID'])

# Input features and target names definition
in_features = data.columns.drop(['Purchase'])
target = 'Purchase'

# Training and testing split (random split)
random.seed(0)  # note: `random.seed = 0` would only shadow the function
train_id = random.sample(range(0, data.shape[0]), 440054)
test_id = list(set(np.arange(0, data.shape[0])) - set(train_id))
train_data = data.iloc[train_id, :]
test_data = data.iloc[test_id, :]

# Binary target: purchase above the 75th percentile of the respective split
train_data["Purchase_level"] = (train_data[target] >
                                np.quantile(train_data[target], 0.75)).astype(int)
test_data["Purchase_level"] = (test_data[target] >
                               np.quantile(test_data[target], 0.75)).astype(int)

in_features = train_data.columns.drop(['Purchase_level', 'Purchase'])
target = 'Purchase_level'
# =============================================================================
time_list.append(time.time())
# =============================================================================

# =============================================================================
def select_data(dataframe, ccd, rawy_range=(1, 200), filter_select=None):
    """Make a selection of data from the input dataframe and return the
    selected dataframe

    Parameters
    ----------
    dataframe : dataframe, mandatory
        the pandas dataframe with the Cu Kalpha fit results (from Michael
        Smith monitoring run). Produced by `ff_monitoring_work2.ipynb`
    ccd : int, mandatory
        the EPIC-pn CCD number (from 1 to 12)
    rawy_range : list, optional
        the RAWY range selection, default the full CCD range (1, 200)
    filter_select : str
        if not None, then a selection on filter wheel is requested; can be one
        of 'CalClosed', 'CalMedium', 'CalThick', 'CalThin1', 'Closed',
        'Medium', 'Thick', 'Thin1', 'Thin2'. If None, then all are selected.

    Output
    ------
    df_out, pandas dataframe
        A new dataframe with the selected records, sorted on `delta_time`

    Method
    ------
    First, a selection based on CCD and RAWY range is done. Then further
    filtering is based on:

    * best-fit Gaussian line sigma, within the (16, 84)% quantiles
    * exposure time (>= 10 ks)
    * number of discarded lines (<= 300), only applied for FF mode
    * best-fit line energy mean error (<= 20 eV), and neither the upper nor
      the lower error bar is zero
    * if filter_select is used, then also select on filter.

    The filtering is just to discard bad or poor fit results. We also discard
    duplicates (if any) based on `delta_time` (time in years since
    2000-01-01) and finally sort on `delta_time`.
    """
    df_ccd = dataframe[(dataframe.ccd == ccd)
                       & (dataframe.rawy0 == rawy_range[0])
                       & (dataframe.rawy1 == rawy_range[1])]
    ntot, _ = df_ccd.shape
    df_ccd.xmode = dataframe.xmode
    #
    # get the quantile distribution (16, 50, 84) of the best-fit Gaussian
    # line sigma; the lower and upper quantiles are used to filter out the
    # bad fit results
    #
    qq = np.quantile(df_ccd.sigma, (0.16, 0.84))
    fwhm = np.rint(qq * SIG2FWHM).astype(int)
    qq = np.rint(qq).astype(int)
    #
    # common quality cuts; mode- and filter-specific cuts are added below
    #
    mask = ((df_ccd.ccd == ccd) &
            (df_ccd.expo_time >= 10000.0) &
            (df_ccd.rawy0 == rawy_range[0]) &
            (df_ccd.rawy1 == rawy_range[1]) &
            (df_ccd.sigma >= qq[0]) & (df_ccd.sigma <= qq[1]) &
            # (df_ccd.chi2r <= 3.0) & (df_ccd.dof >= 10) &
            ((df_ccd.energy_err1 + df_ccd.energy_err2) / 2.0 <= 20.0) &
            (df_ccd.energy_err1 * df_ccd.energy_err2 > 0.0))
    if df_ccd.xmode == 0:
        xmode = 'FF'
        mask = mask & (df_ccd.ndl <= 300.0)
    elif df_ccd.xmode == 1:
        xmode = 'EFF'
    else:
        print(f'Cannot process mode={df_ccd.xmode}, '
              'only mode=0 (FF) or mode=1 (EFF).')
        return None
    if filter_select is not None:
        mask = mask & (df_ccd['filter'] == filter_select)
    df_out = df_ccd[mask].drop_duplicates('delta_time')
    #
    _ = df_out.sort_values(by='delta_time', inplace=True)
    df_out.xmode = dataframe.xmode
    #
    return df_out
async def run(args):
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    filterwarnings("ignore",
                   message=".*NVLink.*rmm_pool_size.*",
                   category=UserWarning)

    async with Cluster(*cluster_args, **cluster_kwargs,
                       asynchronous=True) as cluster:
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        # Use the scheduler address with an SSHCluster rather than the cluster
        # object, otherwise we can't shut it down.
        async with Client(scheduler_addr if args.multi_node else cluster,
                          asynchronous=True) as client:
            scheduler_workers = await client.run_on_scheduler(
                get_scheduler_workers)

            await client.run(
                setup_memory_pool,
                disable_pool=args.disable_rmm_pool,
                log_directory=args.rmm_log_directory,
            )
            # Create an RMM pool on the scheduler due to occasional
            # deserialization of CUDA objects. May cause issues with
            # InfiniBand otherwise.
            await client.run_on_scheduler(
                setup_memory_pool,
                pool_size=1e9,
                disable_pool=args.disable_rmm_pool,
                log_directory=args.rmm_log_directory,
            )

            took_list = []
            for i in range(args.runs):
                took_list.append(await _run(client, args))

            # Collect, aggregate, and print peer-to-peer bandwidths
            incoming_logs = await client.run(
                lambda dask_worker: dask_worker.incoming_transfer_log)
            bandwidths = defaultdict(list)
            total_nbytes = defaultdict(list)
            for k, L in incoming_logs.items():
                for d in L:
                    if d["total"] >= args.ignore_size:
                        bandwidths[k, d["who"]].append(d["bandwidth"])
                        total_nbytes[k, d["who"]].append(d["total"])
            bandwidths = {
                (scheduler_workers[w1].name, scheduler_workers[w2].name): [
                    "%s/s" % format_bytes(x)
                    for x in np.quantile(v, [0.25, 0.50, 0.75])
                ]
                for (w1, w2), v in bandwidths.items()
            }
            total_nbytes = {
                (scheduler_workers[w1].name, scheduler_workers[w2].name):
                format_bytes(sum(nb))
                for (w1, w2), nb in total_nbytes.items()
            }

            print("Roundtrip benchmark")
            print("--------------------------")
            print(f"Size        | {args.size}*{args.size}")
            print(f"Chunk-size  | {args.chunk_size}")
            print(f"Ignore-size | {format_bytes(args.ignore_size)}")
            print(f"Protocol    | {args.protocol}")
            print(f"Device(s)   | {args.devs}")
            if args.device_memory_limit:
                print(f"memory-limit | "
                      f"{format_bytes(args.device_memory_limit)}")
            print("==========================")
            print("Wall-clock  | npartitions")
            print("--------------------------")
            for (took, npartitions) in took_list:
                t = format_time(took)
                t += " " * (12 - len(t))
                print(f"{t} | {npartitions}")
            print("==========================")
            print("(w1,w2)     | 25% 50% 75% (total nbytes)")
            print("--------------------------")
            for (d1, d2), bw in sorted(bandwidths.items()):
                fmt = ("(%s,%s)     | %s %s %s (%s)"
                       if args.multi_node or args.sched_addr
                       else "(%02d,%02d)     | %s %s %s (%s)")
                print(fmt % (d1, d2, bw[0], bw[1], bw[2],
                             total_nbytes[(d1, d2)]))

            if args.benchmark_json:
                bandwidths_json = {
                    # note the f-prefix: the original string literal was
                    # never interpolated
                    f"bandwidth_({d1},{d2})_{i}"
                    if args.multi_node or args.sched_addr
                    else "(%02d,%02d)_%s" % (d1, d2, i):
                    parse_bytes(v.rstrip("/s"))
                    for (d1, d2), bw in sorted(bandwidths.items())
                    for i, v in zip(
                        ["25%", "50%", "75%", "total_nbytes"],
                        [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
                    )
                }
                with open(args.benchmark_json, "a") as fp:
                    for took, npartitions in took_list:
                        fp.write(
                            dumps(
                                dict(
                                    {
                                        "size": args.size * args.size,
                                        "chunk_size": args.chunk_size,
                                        "ignore_size": args.ignore_size,
                                        "protocol": args.protocol,
                                        "devs": args.devs,
                                        "device_memory_limit":
                                            args.device_memory_limit,
                                        "worker_threads":
                                            args.threads_per_worker,
                                        "rmm_pool": not args.disable_rmm_pool,
                                        "tcp": args.enable_tcp_over_ucx,
                                        "ib": args.enable_infiniband,
                                        "nvlink": args.enable_nvlink,
                                        "wall_clock": took,
                                        "npartitions": npartitions,
                                    },
                                    **bandwidths_json,
                                )) + "\n")

        # An SSHCluster will not automatically shut down, we have to
        # ensure it does.
        if args.multi_node:
            await client.shutdown()
"subject_nickname", color='k', linewidth=0) # this will be empty, hack # now plot the datapoints, no errorbars sns.lineplot(data=behav.loc[behav.task == 'traini', :], x='signed_contrast', y='choice2', marker='o', err_style='bars', color='k', linewidth=0, ci=95, ax=fig.ax) # overlay the simulated # confidence intervals from the model - shaded regions fig.ax.fill_between(sorted(behav.signed_contrast.unique()), np.quantile(np.array(simulation_basic), q=0.025, axis=0), np.quantile(np.array(simulation_basic), q=0.975, axis=0), alpha=0.5, facecolor='k') fig.set_axis_labels(' ', 'Rightward choices (%)') fig.despine(trim=True) fig.savefig(os.path.join(figpath, "figure5b_basic_psychfunc.pdf")) # FULL TASK plt.close('all') fig = sns.FacetGrid(behav.loc[behav.task == 'biased', :], hue="probabilityLeft", palette=cmap, sharex=True, sharey=True, height=FIGURE_HEIGHT,
def quantile_normalize(im, low=.01, high=.99):
    im = im.astype('float32')
    tlow, thigh = np.quantile(im, low), np.quantile(im, high)
    im -= tlow
    im /= (thigh - tlow)  # scale by the quantile range, not the raw threshold
    return np.clip(im, 0., 1.)
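# Usage sketch: stretch a float image so that the central 98% of its
# intensity range maps into [0, 1], clipping the tails.
import numpy as np

im = np.random.rand(64, 64) * 300.0
im_norm = quantile_normalize(im)
assert 0.0 <= im_norm.min() and im_norm.max() <= 1.0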
def get_radius(dist: torch.Tensor, nu: float):
    """Optimally solve for radius R via the (1 - nu)-quantile of distances."""
    return np.quantile(np.sqrt(dist.clone().data.cpu().numpy()), 1 - nu)
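# Usage sketch (hypothetical tensor): `dist` holds squared distances of
# embeddings to a hypersphere centre; nu is the fraction of points allowed
# to fall outside the returned radius.
import torch

dist = torch.rand(1000) * 4.0    # squared distances
R = get_radius(dist, nu=0.05)    # (1 - nu)-quantile of the root distances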
eprint("OS error:", err) except: eprint("unexpected error:", sys.exc_info()[0]) raise else: eprint('\ndataset has', len(dataset), 'entries\n') # format dataset matrix = preprocess(dataset) # normalize data dataset_normalized = normalize(matrix) users_to_recommend = list(dataset_normalized.user.values) # split data into training and testing training_data, testing_data = split_data(dataset_normalized) num_items = len(list(set(training_data.to_dataframe().item.values))) # show raw data stats raw_variable_quantiles = np.quantile(dataset.variable.values, [0, .25, .5, .75, 1]) eprint('\nquantiles:', raw_variable_quantiles, '\n') with mlflow.start_run(): try: # train and store model model = create_model(training_data) # create recomendations recom = model.recommend(users=users_to_recommend, k=recom_n) except: eprint('run failed') raise else: eprint('\nsaving recommendations...\n') save_recom(recom) # calculate metrics eprint('\n*** calculating metrics ***\n')
mg = load_metagraph("G", version="2020-04-01")
mg = preprocess(
    mg,
    threshold=0,
    sym_threshold=False,
    remove_pdiff=True,
    binarize=False,
    weight="weight",
)
meta = mg.meta

# plot where we are cutting out nodes based on degree
degrees = mg.calculate_degrees()
fig, ax = plt.subplots(1, 1, figsize=(5, 2.5))
sns.distplot(np.log10(degrees["Total edgesum"]), ax=ax)
q = np.quantile(degrees["Total edgesum"], 0.05)
ax.axvline(np.log10(q), linestyle="--", color="r")
ax.set_xlabel("log10(total synapses)")

# remove low degree neurons
idx = meta[degrees["Total edgesum"] > q].index
mg = mg.reindex(idx, use_ids=True)

# remove center neurons  # FIXME
idx = mg.meta[mg.meta["hemisphere"].isin(["L", "R"])].index
mg = mg.reindex(idx, use_ids=True)

mg = mg.make_lcc()
mg.calculate_degrees(inplace=True)
meta = mg.meta
meta["inds"] = range(len(meta))
def test_large_epsilon(self):
    a = np.random.random(1000)
    res = np.quantile(a, 0.5)
    res_dp = quantile(a, 0.5, epsilon=5, bounds=(0, 1))
    self.assertAlmostEqual(float(res), float(res_dp), delta=0.01)
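# A companion sketch along the same lines (not from the original suite): with
# a much smaller epsilon the DP estimate is noisier, but the bounds argument
# should still confine it to [0, 1].
def test_small_epsilon(self):
    a = np.random.random(1000)
    res_dp = quantile(a, 0.5, epsilon=0.01, bounds=(0, 1))
    self.assertTrue(0 <= res_dp <= 1)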
#!/usr/bin/env python
import numpy as np

data = range(1000)
q = [0.01, 0.99]
res = np.quantile(data, q)
print('res = {}'.format(res))
        if epoch >= 200:
            sample.append(X)
    predicts.append(numpy.mean(sample, axis=0) / N)
    return numpy.array(predicts)


for N in nlist:
    for alpha in [1.0, 0.1, 0.01]:
        predicts = gibbs_sampling(N, alpha)
        start = predicts.min()
        end = predicts.max()
        bins = 40
        step = (end - start) / bins
        plt.hist(predicts, bins=numpy.arange(start, end, step), density=True)
        plt.title("N = %d, alpha = %.2f" % (N, alpha))
        plt.legend(legend)
        plt.tight_layout()
        plt.savefig("rr-gibbs-%d-%.2f.png" % (N, alpha))
        plt.close()
        print("N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median"
              % (N, alpha))
        print(
            numpy.vstack(([
                true_prob,
                numpy.mean(predicts, axis=0),
                numpy.std(predicts, axis=0)
            ], numpy.quantile(predicts, [0.025, 0.975, 0.5], axis=0))))
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 18 16:48:42 2019

@author: scott
"""
import re
import sys

import numpy as np

x = []
with open(sys.argv[1], 'r') as tr:
    for line in tr:
        t = re.findall(r'\d?\.?\d+\.?\d?(?=:)|(?<=\):)\d+\.?\d+',
                       line.strip())
        x.append(sum(map(float, t)))
        if sum(map(float, t)) > 10:
            print(line)

print(np.mean(x))
print(np.quantile(x, 0.025))
print(np.quantile(x, 0.975))
def fit(self, X, y, implement_fixed_controls=False, patholog_dirn=None):
    #* Requires direction of disease progression as input
    if patholog_dirn is None:
        patholog_dirn = disease_direction(X, y)
    # ####### Diagnostic
    # if patholog_dirn < 0:
    #     print('kde.py DIAGNOSTIC: fit(), Disease progresses with decreasing biomarker values - ')
    # elif patholog_dirn > 0:
    #     print('kde.py DIAGNOSTIC: fit(), Disease progresses with increasing biomarker values + ')
    # else:
    #     print('kde.py DIAGNOSTIC: fit(), ERROR: Disease direction in fit(..., patholog_dirn) must be either positive or negative. \n patholog_dirn = {0}'.format(patholog_dirn))
    # #######
    sorted_idx = X.argsort(axis=0).flatten()
    kde_values = X.copy()[sorted_idx].reshape(-1, 1)
    kde_labels0 = y.copy()[sorted_idx]
    kde_labels = kde_labels0
    # print('Original labels')
    # print(kde_labels.astype(int))

    bin_counts = np.bincount(y).astype(float)
    mixture0 = sum(kde_labels == 0) / len(kde_labels)  # Prior of being a control
    mixture = mixture0
    old_ratios = np.zeros(kde_labels.shape)
    iter_count = 0

    if self.bandwidth is None:
        #* 1. Rule of thumb
        self.bandwidth = hscott(X)
        # #* 2. Estimate full density to inform variable bandwidth: wide in tails, narrow in peaks
        # all_kde = neighbors.KernelDensity(kernel=self.kernel,
        #                                   bandwidth=self.bandwidth)
        # all_kde.fit(kde_values)
        # f = np.exp(all_kde.score_samples(kde_values))
        # #* 3. Local, a.k.a. variable, bandwidth given by eq. 3 of https://ieeexplore.ieee.org/abstract/document/7761150
        # g = stats.mstats.gmean(f)
        # alpha = 0.5  # sensitivity parameter: 0...1
        # lamb = np.power(f / g, -alpha)

    for i in range(self.n_iters):
        # print('Iteration {0}. kde_labels = {1}'.format(i, [int(k) for k in kde_labels]))
        #* Automatic variable/local bandwidth for each component: awkde package from github
        controls_kde = GaussianKDE(glob_bw="scott", alpha=self.beta,
                                   diag_cov=False)
        patholog_kde = GaussianKDE(glob_bw="scott", alpha=self.alpha,
                                   diag_cov=False)
        # controls_kde = GaussianKDE(glob_bw="scott", alpha=0.1, diag_cov=False)
        # patholog_kde = GaussianKDE(glob_bw="scott", alpha=0.1, diag_cov=False)
        controls_kde.fit(kde_values[kde_labels == 0])
        patholog_kde.fit(kde_values[kde_labels == 1])

        controls_score = controls_kde.predict(kde_values)
        patholog_score = patholog_kde.predict(kde_values)

        controls_score = controls_score * mixture
        patholog_score = patholog_score * (1 - mixture)
        ratio = controls_score / (controls_score + patholog_score)
        # print('Iteration {0}. ratio (percent) = {1}'.format(i, [int(r * 100) for r in ratio]))

        #* Empirical cumulative distribution: used to swap labels for
        #* patients with super-normal values (greater/less than CDF=0.5)
        cdf_controls = np.cumsum(controls_score) / max(np.cumsum(controls_score))
        cdf_patholog = np.cumsum(patholog_score) / max(np.cumsum(patholog_score))
        cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog + cdf_controls)
        disease_dirn = -np.sign(np.nansum(cdf_diff))
        # disease_dirn = -np.sign(np.mean(cdf_diff))
        if disease_dirn > 0:
            cdf_direction = 1 + cdf_diff
        else:
            cdf_direction = -cdf_diff

        #* Identify "normal" biomarkers as being on the healthy side of the
        #* controls median => flip patient labels
        if patholog_dirn < 0:
            #* More normal (greater) than half the controls: CDF_controls > 0.5
            labels_forced_normal = cdf_controls > 0.5
            labels_forced_normal_alt = kde_values > np.median(
                kde_values[kde_labels0 == 0])
        elif patholog_dirn > 0:
            #* More normal (less) than half the controls: CDF_controls < 0.5
            labels_forced_normal = cdf_controls < 0.5
            labels_forced_normal_alt = kde_values < np.median(
                kde_values[kde_labels0 == 0])

        #* FIXME: Make this a prior and change the mixture modelling to be Bayesian
        #* First iteration only: implement a "prior" that flips
        #* healthy-looking patients (before the controls median) to the
        #* pre-event label, then refit the KDEs
        if i == 0:
            #* Disease direction: force pre-event/healthy-looking patients to flip
            kde_labels[np.where(labels_forced_normal_alt)[0]] = 0
            bin_counts = np.bincount(kde_labels).astype(float)
            mixture = bin_counts[0] / bin_counts.sum()
            #* Refit the KDE components. FIXME: this is copy-and-paste from
            #* above. Reimplement in a smarter way.
            controls_kde.fit(kde_values[kde_labels == 0])
            patholog_kde.fit(kde_values[kde_labels == 1])
            controls_score = controls_kde.predict(kde_values)
            patholog_score = patholog_kde.predict(kde_values)
            controls_score = controls_score * mixture
            patholog_score = patholog_score * (1 - mixture)
            ratio = controls_score / (controls_score + patholog_score)
            #* Empirical cumulative distribution (as above)
            cdf_controls = np.cumsum(controls_score) / max(np.cumsum(controls_score))
            cdf_patholog = np.cumsum(patholog_score) / max(np.cumsum(patholog_score))
            cdf_diff = (cdf_patholog - cdf_controls) / (cdf_patholog + cdf_controls)
            disease_dirn = -np.sign(np.nansum(cdf_diff))
            # disease_dirn = -np.sign(np.mean(cdf_diff))
            if disease_dirn > 0:
                cdf_direction = 1 + cdf_diff
                # print('Disease direction is estimated to be POSITIVE')
            else:
                cdf_direction = -cdf_diff
                # print('Disease direction is estimated to be NEGATIVE')
            #* Identify "normal" biomarkers (as above)
            if patholog_dirn < 0:
                labels_forced_normal = cdf_controls > 0.5
                labels_forced_normal_alt = kde_values > np.median(
                    kde_values[kde_labels0 == 0])
            elif patholog_dirn > 0:
                labels_forced_normal = cdf_controls < 0.5
                labels_forced_normal_alt = kde_values < np.median(
                    kde_values[kde_labels0 == 0])

        if np.all(ratio == old_ratios):
            # print('MM finished in {0} iterations'.format(iter_count))
            break

        iter_count += 1
        old_ratios = ratio
        kde_labels = ratio < 0.5

        #* Labels to swap:
        diff_y = np.hstack(([0], np.diff(kde_labels)))
        # != 0 where adjacent labels differ
        if (np.sum(diff_y != 0) >= 2) & (np.unique(kde_labels).shape[0] == 2):
            split_y = int(np.all(np.diff(np.where(kde_labels == 0)) == 1))
            # kde_label upon which to split: 1 if all 0s are adjacent, 0 otherwise
            sizes = [x.shape[0]
                     for x in np.split(diff_y, np.where(diff_y != 0)[0])]
            # lengths of each contiguous set of labels
            #* Identify which labels to swap using the direction of
            #* abnormality: avg(controls) vs avg(patients). Note that this is
            #* now like k-medians clustering, rather than k-means
            split_prior_smaller = (
                np.median(kde_values[kde_labels == split_y]) <
                np.median(kde_values[kde_labels == (split_y + 1) % 2]))
            if split_prior_smaller:
                replace_idxs = np.arange(kde_values.shape[0])[-sizes[2]:]
                # greater values are swapped
            else:
                replace_idxs = np.arange(kde_values.shape[0])[:sizes[0]]
                # lesser values are swapped
            kde_labels[replace_idxs] = (split_y + 1) % 2  # swaps labels

        #* Disease direction: force pre-event/healthy-looking patients to flip
        kde_labels[np.where(labels_forced_normal_alt)[0]] = 0

        #*** Prevent label swapping for "strong controls"
        fixed_controls_criteria_0 = (kde_labels0 == 0)  # Controls
        # #*** CDF criteria - do not delete: potentially also used for disease direction
        # en = 10
        # cdf_threshold = (en - 1) / (en + 1)  # cdf(p) = en*(1-cdf(c)), i.e., en-times more patients than remaining controls
        # controls_tail = cdf_direction > (cdf_threshold * max(cdf_direction))
        # # fixed_controls_criteria_0 = fixed_controls_criteria_0 & (~controls_tail)
        # #*** PDF ratio criteria
        # ratio_threshold_strong_controls = 0.33  # P(control) / [P(control) + P(patient)]
        # fixed_controls_criteria = fixed_controls_criteria & (ratio > ratio_threshold_strong_controls)  # "Strong controls"

        #*** Outlier criteria for weak (e.g., low-performing on test; or
        #*** potentially prodromal in sporadic disease) controls: quantiles
        q = 0.90  # x-tiles
        if disease_dirn > 0:
            q = q  # upper
            f = np.greater
            g = np.less
            # print('Disease direction: positive')
        else:
            q = 1 - q  # lower
            f = np.less
            g = np.greater
            # print('Disease direction: negative')
        extreme_cases = f(kde_values,
                          np.quantile(kde_values, q)).reshape(-1, 1)
        # & (kde_labels0 == 0)
        fixed_controls_criteria = (fixed_controls_criteria_0.reshape(-1, 1)
                                   & ~extreme_cases)
        if implement_fixed_controls:
            kde_labels[np.where(fixed_controls_criteria)[0]] = 0
            # kde_labels[np.where(controls_outliers)[0]] = 1  # Flip outlier controls

        bin_counts = np.bincount(kde_labels).astype(float)
        mixture = bin_counts[0] / bin_counts.sum()
        if mixture < 0.10 or mixture > 0.90:
            # if mixture < (0.90 * mixture0) or mixture > 0.90:
            # print('MM finished (mixture weight too low/high) in {0} iterations'.format(iter_count))
            break

    self.controls_kde = controls_kde
    self.patholog_kde = patholog_kde
    self.mixture = mixture
    self.iter_ = iter_count
    return self
MZ = []
peak_no = 0
peak_clusters_no = 0
add_clusters = 0
for pc in ppm_dist_clusters(sp.peaks(), 100.0):
    if len(pc) > len(sp) * quorum and pc.peaks_from_different_spectra():
        for p in pc:
            MZ.append(p.mz)
        peak_no += len(pc)
        peak_clusters_no += 1
        add_clusters += max(pc.which_spectra().values()) - 1

MZ = np.array(MZ)
dMZ = np.diff(MZ)
last_peak_diff = (peak_no - 1 - peak_clusters_no) / (peak_no - 1)
quantile_distance = np.quantile(2 * dMZ / (MZ[1:] + MZ[:-1]) * 1e6,
                                last_peak_diff)
P = np.linspace(0, 1, 10000)
Q = np.quantile(2 * dMZ / (MZ[1:] + MZ[:-1]) * 1e6, P)
plt.plot(Q, P)
plt.scatter(quantile_distance, last_peak_diff)
plt.show()

# what if we modified the k?
MZ = A([p.mz for p in sp.peaks()])
N = A([p.spec_no for p in sp.peaks()])
dMZ = np.diff(MZ)
peak_no = len(MZ)
last_peak_diff = (peak_no - 1 - peak_clusters_no)
print("Using all the historical data, without the year 2020, for the month =", m, \ ", during the week between days", fd, "and", ed, "the mean flow is =", flow_mean1) print("") print("Using the last 10 years data, without the year 2020, for the month =", m, \ ", during the week between days", fd, "and", ed, "the mean flow is =", flow_mean2) print("") print("In the year", y, "for the month =", m, ", during the week between days", \ fd, "and", ed, "the mean flow is =", flow_mean3) print("") # %% # Quantiles # historical quantiles, without the year 2020. flow_quants1 = np.quantile(flow_data[(flow_data[:,0] != 2020) & \ (flow_data[:,1] == m) & (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3],\ q=[0,0.33,0.5,0.66,1.0]) # Quantiles since the year 2009 (the last 10 years), without the year 2020. flow_quants2 = np.quantile(flow_data[(flow_data[:,0] != 2020) & (flow_data[:,0] >= 2009) &\ (flow_data[:,1] == m) & (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3],\ q=[0,0.33,0.5,0.66,1.0]) # Quantiles for a specific year. flow_quants3 = np.quantile(flow_data[(flow_data[:,0] == y) & (flow_data[:,1] == m) &\ (flow_data[:,2] >= fd) & (flow_data[:,2] <= ed),3], q=[0,0.33,0.5,0.66,1.0]) print("All years, month =", m, "week between days:", fd, "and", ed) print("min, 33%, median, 66%, max") print(flow_quants1) print("")
def quartile(data):
    q1 = np.quantile(data, .25)
    q2 = np.quantile(data, .50)
    q3 = np.quantile(data, .75)
    return q1, q2, q3
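# Usage sketch: the interquartile range follows directly from the quartiles.
import numpy as np

q1, q2, q3 = quartile(np.random.randn(500))
iqr = q3 - q1  # spread of the middle 50% of the data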
]

# Show
sent_topics_sorteddf_mallet.head(10)

# In[ ]:

doc_lens = [len(d) for d in df_dominant_topic.Text]

# Plot
plt.figure(figsize=(16, 7), dpi=160)
plt.hist(doc_lens, bins=1000, color='navy')
plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
plt.text(750, 90, "Median : " + str(round(np.median(doc_lens))))
plt.text(750, 80, "Stdev  : " + str(round(np.std(doc_lens))))
plt.text(750, 70, "1%ile  : " + str(round(np.quantile(doc_lens, q=0.01))))
plt.text(750, 60, "99%ile : " + str(round(np.quantile(doc_lens, q=0.99))))
plt.gca().set(xlim=(0, 1000), ylabel='Number of Documents',
              xlabel='Document Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0, 1000, 9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

# In[ ]:

import seaborn as sns
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()
def plot_trajectories(exper_dir, events, embeddings, word, word_step,
                      font_size):
    embs_dir = os.path.join(exper_dir, 'embs')
    tsne_output = os.path.join(exper_dir, 'visualization')
    vocabulary = os.path.join(embs_dir, 'wordIDHash.csv')

    wordlist = []
    fid = open(vocabulary, 'r')
    for line in fid:
        word_id, _word = line.strip().split(',')
        wordlist.append(_word)
    fid.close()

    word2Id = {}
    for k in range(len(wordlist)):
        word2Id[wordlist[k]] = k

    times = get_points(embs_dir)
    # total number of time points (20/range(27) for ngram/nyt)
    emb_all = sio.loadmat(embeddings)
    emb = emb_all[f'U_{times[-1]}']
    nn = emb.shape[1]

    X = []
    list_of_words = []
    isword = []
    words_by_period = {}
    for year in times:
        emb = emb_all[f'U_{year}']
        embnrm = np.reshape(np.sqrt(np.sum(emb**2, 1)), (emb.shape[0], 1))
        emb_normalized = np.divide(emb, np.tile(embnrm, (1, emb.shape[1])))
        print(emb_normalized.shape)
        v = emb_normalized[word2Id[word], :]
        d = np.dot(emb_normalized, v)
        idx = np.argsort(d)[::-1]
        newwords = [(wordlist[k], year) for k in list(idx[:nn])]
        print(newwords)
        list_of_words.extend(newwords)
        words_by_period[year] = list(map(lambda word: word[0], newwords))
        for k in range(nn):
            isword.append(k == 0)
        X.append(emb[idx[:nn], :])

    X = np.vstack(X)
    print(X.shape)

    import matplotlib.pyplot as plt
    import pickle
    import umap

    model = umap.UMAP(n_neighbors=10, min_dist=0.75, metric='cosine',
                      random_state=1)
    Z = model.fit_transform(X)

    traj_fig, traj_ax = plt.subplots(1, 1)
    traj = []
    target_indexes = []
    not_target_indexes = []
    sum_of_coor = dict()
    for k in range(len(list_of_words)):
        k_word = list_of_words[k][0]  # e.g.: guayaquil
        period = list_of_words[k][1]
        # e.g.: 0 if first week, 1 if second week, etc.
        if isword[k]:
            target_indexes.append(k)
            marker = 's'
            color = 'red' if period in events else 'dodgerblue'
            traj.append(Z[k, :])
            traj_ax.plot(Z[k, 0], Z[k, 1], marker, color=color, markersize=7)
            # plot only a few labels for clarity
            if period % word_step == 0 or period == times[-1]:
                traj_ax.text(Z[k, 0], Z[k, 1], f'{k_word}::{period}',
                             fontsize=font_size)
            else:
                traj_ax.text(Z[k, 0], Z[k, 1], f'{period}',
                             fontsize=font_size)
        else:
            not_target_indexes.append(k)
            sum_of_coor[k_word] = sum_of_coor.get(k_word, np.zeros(2))
            sum_of_coor[k_word] += Z[k]

    distances = []
    for i in target_indexes:
        differences = Z[not_target_indexes] - Z[i]
        distances.extend(np.linalg.norm(differences, axis=1))
    dist_threshold = np.quantile(distances, 0.95)
    print('==', dist_threshold)

    def plot_word(word_index, k_word, list_of_words):
        period = list_of_words[word_index][1]
        # e.g.: 0 if first week, 1 if second week, etc.
        traj_ax.plot(Z[word_index, 0], Z[word_index, 1], 'o',
                     color='mediumseagreen')
        traj_ax.text(Z[word_index, 0], Z[word_index, 1],
                     f'{k_word}::{period}', fontsize=font_size)

    plot_indexes = set()
    plot_words = dict()
    for i in target_indexes:
        differences = Z[not_target_indexes] - Z[i]
        distances = np.linalg.norm(differences, axis=1)
        closest = sorted(zip(distances, not_target_indexes))
        top_threshold = 20
        for distance, word_index in closest[:top_threshold]:
            if distance < dist_threshold and word_index not in plot_indexes:
                k_word = list_of_words[word_index][0]  # e.g.: guayaquil
                if plot_words.get(k_word) is None:
                    plot_word(word_index, k_word, list_of_words)
                    plot_indexes.add(word_index)
                    plot_words[k_word] = np.array([Z[word_index]])
                else:
                    differences = plot_words[k_word] - Z[word_index]
                    distances = np.linalg.norm(differences, axis=1)
                    if distances[distances < 1].shape[0] == 0:
                        plot_word(word_index, k_word, list_of_words)
                        plot_indexes.add(word_index)
                        plot_words[k_word] = np.append(plot_words[k_word],
                                                       [Z[word_index]],
                                                       axis=0)

    traj = np.vstack(traj)
    traj_ax.plot(traj[:, 0], traj[:, 1], linewidth=2)
    plt.show()

    def get_semantic_change(vectors, metric):
        distances = []
        for i in range(1, vectors.shape[0]):
            if metric == 'euclidean':
                distance = np.linalg.norm(vectors[i] - vectors[i - 1])
            elif metric == 'cosine':
                distance = scipy.spatial.distance.cosine(vectors[i],
                                                         vectors[i - 1])
            distances.append(distance)
        return distances

    def plot_semantic_change(data):
        fig, ax = plt.subplots(1, 1, figsize=(15, 5))
        ax.plot(data)
        # ax.plot(acum_distances)
        ax.set_ylabel('Distance')
        ax.set_xlabel('Week')
        ax.legend(['Distance between weeks',
                   'Accumulated distance between weeks'])
        plt.show()

    change_2d = get_semantic_change(traj, 'euclidean')
    change_50d = get_semantic_change(X[target_indexes], 'euclidean')
    change_50d_cosine = get_semantic_change(X[target_indexes], 'cosine')
    plot_semantic_change(change_2d)
    plot_semantic_change(change_50d)
    plot_semantic_change(change_50d_cosine)

    target_word_dir = os.path.join(tsne_output, word)
    if not os.path.isdir(target_word_dir):
        os.makedirs(target_word_dir)
    sio.savemat(os.path.join(target_word_dir, 'embs.mat'), {'emb': Z})
    pickle.dump({'words': list_of_words, 'isword': isword},
                open(os.path.join(target_word_dir, 'wordlist.pkl'), 'wb'))

    for period, context_words in words_by_period.items():
        lines = []
        for context_word in context_words:
            lines.append(f'{word2Id[context_word]},{context_word}\n')
        with open(os.path.join(target_word_dir,
                               f'closer2{word}_week_{period}.csv'),
                  'w') as file:
            file.writelines(lines)

    allwords = ['art', 'damn', 'gay', 'hell', 'maid', 'muslim']

    import matplotlib.pyplot as plt
    import pickle

    Z = sio.loadmat(os.path.join(target_word_dir, 'embs.mat'))['emb']
    data = pickle.load(
        open(os.path.join(target_word_dir, 'wordlist.pkl'), 'rb'))
    list_of_words, isword = data['words'], data['isword']

    plt.clf()
    traj = []
    Zp = Z * 1.
    Zp[:, 0] = Zp[:, 0] * 2.
    all_dist = np.zeros((Z.shape[0], Z.shape[0]))
    for k in range(Z.shape[0]):
        all_dist[:, k] = np.sum((Zp - np.tile(Zp[k, :], (Z.shape[0], 1)))**2.,
                                axis=1)
    dist_to_centerpoints = all_dist[:, isword]
    dist_to_centerpoints = np.min(dist_to_centerpoints, axis=1)
    dist_to_other = all_dist + np.eye(Z.shape[0]) * 1000.
    idx_dist_to_other = np.argsort(dist_to_other, axis=1)
    dist_to_other = np.sort(dist_to_other, axis=1)

    plt.clf()
    for k in range(len(list_of_words) - 1, -1, -1):
        if isword[k]:
            # if list_of_words[k][1] % 3 != 0 and list_of_words[k][1] < 199: continue
            marker = 'bo'
            traj.append(Z[k, :])
            plt.plot(Z[k, 0], Z[k, 1], marker)
        else:
            if dist_to_centerpoints[k] > 200:
                continue
            skip = False
            for i in range(Z.shape[0]):
                if dist_to_other[k, i] < 150 and idx_dist_to_other[k, i] > k:
                    skip = True
                    break
                if dist_to_other[k, i] >= 150:
                    break
            if skip:
                continue
            if Z[k, 0] > 8:
                continue
            plt.plot(Z[k, 0], Z[k, 1])
            plt.text(Z[k, 0] - 2, Z[k, 1] + np.random.randn() * 2,
                     ' %s-%d' % (list_of_words[k][0],
                                 list_of_words[k][1] * 10))
    plt.axis('off')
    traj = np.vstack(traj)
    plt.plot(traj[:, 0], traj[:, 1])
    plt.show()
def upper_absolute_credible_interval(self):
    """ Absolute upper value of the credible interval """
    return np.quantile(self.samples, self._upper_level, axis=0)
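# A matching lower bound presumably exists on the same class; a sketch,
# assuming a `_lower_level` attribute mirroring `_upper_level` (both names
# beyond `_upper_level` are assumptions):
def lower_absolute_credible_interval(self):
    """ Absolute lower value of the credible interval """
    return np.quantile(self.samples, self._lower_level, axis=0)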
from itertools import combinations


def _fit_biases(X, dilations, num_features_per_dilation, quantiles, seed):
    if seed is not None:
        np.random.seed(seed)

    n_instances, n_timepoints = X.shape

    # all 84 combinations of 3 indices out of 9; the original spelled this
    # table out as a hard-coded 252-element tuple
    indices = np.array([_ for _ in combinations(np.arange(9), 3)],
                       dtype=np.int32)

    num_kernels = len(indices)
    num_dilations = len(dilations)

    num_features = num_kernels * np.sum(num_features_per_dilation)

    biases = np.zeros(num_features, dtype=np.float32)

    feature_index_start = 0

    for dilation_index in range(num_dilations):
        dilation = dilations[dilation_index]
        padding = ((9 - 1) * dilation) // 2

        num_features_this_dilation = num_features_per_dilation[dilation_index]

        for kernel_index in range(num_kernels):
            feature_index_end = (feature_index_start +
                                 num_features_this_dilation)

            _X = X[np.random.randint(n_instances)]

            A = -_X           # A = alpha * X = -X
            G = _X + _X + _X  # G = gamma * X = 3X

            C_alpha = np.zeros(n_timepoints, dtype=np.float32)
            C_alpha[:] = A

            C_gamma = np.zeros((9, n_timepoints), dtype=np.float32)
            C_gamma[9 // 2] = G

            start = dilation
            end = n_timepoints - padding

            for gamma_index in range(9 // 2):
                C_alpha[-end:] = C_alpha[-end:] + A[:end]
                C_gamma[gamma_index, -end:] = G[:end]
                end += dilation

            for gamma_index in range(9 // 2 + 1, 9):
                C_alpha[:-start] = C_alpha[:-start] + A[start:]
                C_gamma[gamma_index, :-start] = G[start:]
                start += dilation

            index_0, index_1, index_2 = indices[kernel_index]

            C = (C_alpha + C_gamma[index_0] + C_gamma[index_1] +
                 C_gamma[index_2])

            biases[feature_index_start:feature_index_end] = np.quantile(
                C, quantiles[feature_index_start:feature_index_end])

            feature_index_start = feature_index_end

    return biases
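# Usage sketch (hypothetical inputs): `quantiles` must supply one value in
# (0, 1) per feature, since each bias is drawn as a quantile of a sampled
# convolution output. The golden-ratio spacing below is illustrative, not
# necessarily what the surrounding library uses.
import numpy as np

X = np.random.randn(10, 150).astype(np.float32)
dilations = np.array([1, 2], dtype=np.int32)
num_features_per_dilation = np.array([2, 2], dtype=np.int32)
num_features = 84 * int(num_features_per_dilation.sum())
quantiles = (np.arange(1, num_features + 1) * 0.618034) % 1
biases = _fit_biases(X, dilations, num_features_per_dilation, quantiles,
                     seed=42)
assert biases.shape == (num_features,)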
# Measures of central tendency
salario_jogadores = [
    40000, 18000, 12000, 250000, 30000, 140000, 300000, 40000, 800000
]
np.mean(salario_jogadores)  # mean
np.std(salario_jogadores, ddof=1)  # sample standard deviation
np.var(salario_jogadores, ddof=1)  # sample variance, to match the std above
np.median(salario_jogadores)  # median
np.quantile(salario_jogadores, [0, .25, .50, .75, 1])  # quartiles

# Creating two sampling functions
'''
Creating a function that takes a data frame and returns random cases
according to the desired size of the new n. If N / n < 1, the function
returns the first n cases of the data frame.
'''


def sortearESeguir(df, amostra):
    k = int(len(df) / amostra)
    random_n = np.random.randint(low=1, high=k + 1, size=1)
    acumulador = random_n[0]
    sorteados = []
    for i in range(amostra):
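# The loop body above is cut off. Under the systematic-sampling reading of
# the function (random start, then step by k = N / n), a plausible completion
# is the sketch below; the 0-based start and the final `df.iloc` selection
# are assumptions, not the original code:
def sortearESeguir_sketch(df, amostra):
    k = int(len(df) / amostra)
    if k < 1:  # N / n < 1: return the first n cases
        return df.iloc[:amostra]
    acumulador = np.random.randint(low=0, high=k, size=1)[0]  # random start
    sorteados = []
    for i in range(amostra):
        sorteados.append(acumulador)  # record the current position
        acumulador += k               # jump k rows ahead
    return df.iloc[sorteados]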
bayes_mean_stat_max = np.array([s[0].minmax[1] for s in bcv])
bayes_std_stat = np.array([s[2].statistic for s in bcv])
bayes_std_stat_min = np.array([s[2].minmax[0] for s in bcv])
bayes_std_stat_max = np.array([s[2].minmax[1] for s in bcv])

## estimating data median to get a left and right side X% interval
interval = 0.8
peak_diams_mean = np.zeros(fit_data.shape[0])
lower_diams_mean = np.zeros(fit_data.shape[0])
upper_diams_mean = np.zeros(fit_data.shape[0])
for i in range(fit_data.shape[0]):
    peak_diams_mean[i] = bayes_mean_stat[i]
    lower_diams_mean[i] = np.quantile(
        fit_data[i][fit_data[i] <= bayes_mean_stat[i]], 1 - interval)
    upper_diams_mean[i] = np.quantile(
        fit_data[i][fit_data[i] >= bayes_mean_stat[i]], interval)

dpi = 100
pl.figure(figsize=(10, 10), dpi=dpi)
jitter_intensity = 0.5
step = (diams[1:] - diams[:-1]).mean()
jitter = (0.5 - np.random.rand(Ntrial * diams.shape[0])) * step * jitter_intensity
pl.scatter((np.repeat(diams, Ntrial) + jitter) * 1e6,
           fit_data.ravel() * 1e6,
           color='red', alpha=0.01, edgecolors="none")
def evaluate(prediction_path, country, drawing_area, covid_stats):
    pred_image = Image.open(prediction_path)
    country_image = Image.open(f"{constants.IMAGES_PATH}/{country}.jpg")
    if not pred_image.size == country_image.size:
        logging.error(
            "The size of the submitted image is not equal to the original size."
        )
        return ("The size of the submitted image is not equal to the "
                "original size. Please try again.")

    size = pred_image.size[1], pred_image.size[0]
    pred_data = np.sum(np.array(pred_image.getdata()), axis=1).reshape(size)
    country_data = np.sum(np.array(country_image.getdata()),
                          axis=1).reshape(size)

    x0, y0, x1, y1, x_factor, y_factor = drawing_area
    diff = np.abs(pred_data - country_data).T[int(x0):int(x1),
                                              int(y0):int(y1)]
    x_offset = x0 % 1
    y_offset = y0 % 1

    line_pixels = []
    for row in diff:
        if np.max(row) < 150:
            line_pixels.append(np.array([]))
        else:
            line_pixels.append(
                np.argwhere(row >= np.max(row) * LINE_THRESHOLD))

    for i in line_pixels:
        if len(i):
            break
    else:
        logging.error("No line was found.")
        return "No line was found. Please try again."

    thicknesses = []
    for column in line_pixels:
        if len(column) > 1:
            thicknesses.append(max(column) - min(column))
    line_thickness = np.quantile(thicknesses, 0.2)

    line = []
    for row in line_pixels:
        if not len(row):
            line.append(float("nan"))
        else:
            line.append(np.min(row) + line_thickness / 2)

    data = covid_stats.get("date", "new_cases_smoothed", location=country)
    for i in range(1, 4):
        try:
            last_date, last_value = date2num(data[-i][0]), float(data[-i][1])
            break
        except ValueError:
            logging.error(f"No data for {country} available (attempt {i}).")
    else:
        raise ValueError(f"There is no readable data for {country}.")

    raw_predictions = dict()
    raw_predictions[date2num(datetime.date.today())] = last_value
    last = None
    for i, point in enumerate(line):
        if not np.isnan(point):
            cases = (y1 - y0 - y_offset - point) * y_factor
            if cases < 0:
                cases = 0
            last = raw_predictions[date2num(datetime.date.today()) +
                                   (x_offset + i) * x_factor] = cases
    if not line or last is None:
        return "No line was found. Please try again."
    raw_predictions[date2num(datetime.date.today() + datetime.timedelta(
        days=charts.N_PREDICTED_DAYS))] = last
    return raw_predictions
with open(path_members) as file_member:
    for line in file_member:
        fields = line.rstrip('\n').split('\t')
        members.append(fields[1])

# Open alignments and calculate statistics
means = []
stds = []
iqrs = []
for member in members:
    with gzip.open(dir_msa + member + '.raw_alg.faa.gz', 'rt') as file:
        MSA = AlignIO.read(file, 'fasta')
    fractions = fraction_ungapped(MSA)
    means.append(stats.tmean(fractions))
    stds.append(stats.tstd(fractions))
    iqrs.append(quantile(fractions, 0.75) - quantile(fractions, 0.25))

# Save statistics to folder
root, _ = os.path.splitext(os.path.basename(path_members))
# Get name of member file without extension
if not os.path.exists('out/' + root):
    os.makedirs('out/' + root)  # Recursive folder creation

with open('out/' + root + '/means.json', 'w') as file:
    json.dump(means, file)
with open('out/' + root + '/stds.json', 'w') as file:
    json.dump(stds, file)
with open('out/' + root + '/iqrs.json', 'w') as file:
    json.dump(iqrs, file)

"""
DEPENDENCIES
for i in range(len(region_proposals)):
    start, end, score = (int(region_proposals[i][0]),
                         int(region_proposals[i][1]),
                         region_proposals[i][2])
    if np.isnan(score):
        print("i=", i, "score = ", score)
    anomaly_time_scores[start:end] += score / np.power(end - start, -0.2)
    anomaly_time_weights[start:end] += 1 / np.power(end - start, -0.2)

anomaly_time_scores_aver = anomaly_time_scores / anomaly_time_weights
print("np.corrcoef time score:",
      np.corrcoef(anomaly_time_scores_aver, anomaly_level))
np.save(args.output_score + ".npy", anomaly_time_scores_aver)

# sweep the detection threshold between the 80th and 99th percentiles of the
# averaged score and keep the best F1
max_f1 = 0
for pred_th in np.linspace(np.quantile(anomaly_time_scores_aver, 0.8),
                           np.quantile(anomaly_time_scores_aver, 0.99), 200):
    res = eval_measure(anomaly_level, anomaly_time_scores_aver,
                       test_th=0.5, pred_th=pred_th)
    if res[2] > max_f1:
        max_f1 = res[2]
        print("for pred_th = ", pred_th, "res = ", res)
print("max f1: ", max_f1)

plt.figure(figsize=(20, 10))
range2 = np.arange(0, data_attack.shape[0])
def run(self):
    print('_____>>>', self.filepath)
    if str(self.filepath).endswith('.h5'):
        print('loading from .h5')
        file = h5py.File(self.filepath, 'r')
        print('UUUU', file.keys())
        data_esr_norm = file['esr_map']
        self.frequencies = file['frequency']
    else:
        print('loading from data_subscripts')
        data_esr = []
        for f in sorted(glob.glob(os.path.join(self.filepath, 'data_subscripts/*'))):
            data = Script.load_data(f)
            data_esr.append(data['data'])
        self.frequencies = data['frequency']

        # normalize each ESR trace; 'quantile' divides by the trace's 75th percentile
        norm = 'quantile'
        norm_parameter = 0.75
        if norm == 'mean':
            norm_value = [np.mean(d) for d in data_esr]
        elif norm == 'border':
            # here norm_parameter is a point count: > 0 averages the first
            # points, < 0 the last points (undefined for exactly 0)
            if norm_parameter > 0:
                norm_value = [np.mean(d[0:norm_parameter]) for d in data_esr]
            elif norm_parameter < 0:
                norm_value = [np.mean(d[norm_parameter:]) for d in data_esr]
        elif norm == 'quantile':
            norm_value = [np.quantile(d, norm_parameter) for d in data_esr]
        # normalize and convert to numpy array
        data_esr_norm = np.array([d / n for d, n in zip(data_esr, norm_value)])

    angle = np.arange(len(data_esr_norm))
    print('<<<<<<<', self.frequencies.shape, angle.shape, data_esr_norm.shape)
    self.x_range = list(range(0, len(data_esr_norm)))
    self.status.emit('executing manual fitting')
    index = 0
    while index < self.NUM_ESR_LINES:
        # this must be after the draw command, otherwise plot doesn't display for some reason
        self.status.emit('executing manual fitting NV #' + str(index))
        self.plotwidget.axes.clear()
        self.plotwidget.axes.pcolor(self.frequencies, angle, data_esr_norm)
        if self.interps:
            for f in self.interps:
                self.plotwidget.axes.plot(f(self.x_range), self.x_range)
        self.plotwidget.draw()
        while True:
            if self.queue.empty():
                time.sleep(.5)
            else:
                value = self.queue.get()
                if value == 'next':
                    self.peak_vals.clear()  # clear in place; the list may be shared with the GUI
                    index += 1
                    self.interps.append(f)  # f is the spline from the last 'fit' command
                    break
                elif value == 'clear':
                    self.plotwidget.axes.clear()
                    self.plotwidget.axes.imshow(data_esr_norm, aspect='auto', origin='lower')
                    if self.interps:
                        for f in self.interps:
                            self.plotwidget.axes.plot(f(self.x_range), self.x_range)
                    self.plotwidget.draw()
                elif value == 'fit':
                    peak_vals = np.array(self.peak_vals)
                    print('ggggg', peak_vals.shape)
                    y, x = peak_vals[:, 0], peak_vals[:, 1]
                    # sort the points so x is increasing (in case we accidentally clicked below a point)
                    y = y[x.argsort()]
                    x = sorted(x)
                    f = UnivariateSpline(x, y)
                    x_range = list(range(0, len(data_esr_norm)))
                    self.plotwidget.axes.plot(f(x_range), x_range)
                    self.plotwidget.draw()
                elif value == 'prev':
                    index -= 1
                    break
                elif value == 'skip':
                    index += 1
                    break
                elif type(value) is int:
                    index = int(value)
                    break

    self.finished.emit()
    self.status.emit('saving')
    self.plotwidget.axes.clear()
    angle = np.arange(len(data_esr_norm))
    self.plotwidget.axes.pcolor(self.frequencies, angle, data_esr_norm)
    if self.interps:
        for f in self.interps:
            self.plotwidget.axes.plot(f(self.x_range), self.x_range)
    self.save()
    self.status.emit('saving finished')
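# The 'quantile' normalization branch above in isolation: each trace is
# divided by its own 75th percentile, so traces with very different overall
# intensity end up on a comparable scale. Synthetic traces of equal length.
import numpy as np

rng = np.random.default_rng(1)
data_esr = [rng.normal(loc=gain, scale=0.1, size=100) for gain in (1.0, 5.0, 20.0)]
norm_value = [np.quantile(d, 0.75) for d in data_esr]
data_esr_norm = np.array([d / n for d, n in zip(data_esr, norm_value)])
print(np.quantile(data_esr_norm, 0.75, axis=1))  # ~[1., 1., 1.]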
def create_features(seg_id, seg, X):
    xc = pd.Series(seg['acoustic_data'].values)
    zc = np.fft.fft(xc)

    X.loc[seg_id, 'mean'] = xc.mean()
    X.loc[seg_id, 'std'] = xc.std()
    X.loc[seg_id, 'max'] = xc.max()
    X.loc[seg_id, 'min'] = xc.min()

    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    X.loc[seg_id, 'Rmean'] = realFFT.mean()
    X.loc[seg_id, 'Rstd'] = realFFT.std()
    X.loc[seg_id, 'Rmax'] = realFFT.max()
    X.loc[seg_id, 'Rmin'] = realFFT.min()
    X.loc[seg_id, 'Imean'] = imagFFT.mean()
    X.loc[seg_id, 'Istd'] = imagFFT.std()
    X.loc[seg_id, 'Imax'] = imagFFT.max()
    X.loc[seg_id, 'Imin'] = imagFFT.min()
    X.loc[seg_id, 'Rmean_last_5000'] = realFFT[-5000:].mean()
    X.loc[seg_id, 'Rstd_last_5000'] = realFFT[-5000:].std()
    X.loc[seg_id, 'Rmax_last_5000'] = realFFT[-5000:].max()
    X.loc[seg_id, 'Rmin_last_5000'] = realFFT[-5000:].min()
    X.loc[seg_id, 'Rmean_last_15000'] = realFFT[-15000:].mean()
    X.loc[seg_id, 'Rstd_last_15000'] = realFFT[-15000:].std()
    X.loc[seg_id, 'Rmax_last_15000'] = realFFT[-15000:].max()
    X.loc[seg_id, 'Rmin_last_15000'] = realFFT[-15000:].min()

    X.loc[seg_id, 'mean_change_abs'] = np.mean(np.diff(xc))
    # note: this averages the indices of nonzero relative changes, not the changes themselves
    X.loc[seg_id, 'mean_change_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0])
    X.loc[seg_id, 'abs_max'] = np.abs(xc).max()
    X.loc[seg_id, 'abs_min'] = np.abs(xc).min()

    X.loc[seg_id, 'std_first_50000'] = xc[:50000].std()
    X.loc[seg_id, 'std_last_50000'] = xc[-50000:].std()
    X.loc[seg_id, 'std_first_10000'] = xc[:10000].std()
    X.loc[seg_id, 'std_last_10000'] = xc[-10000:].std()
    X.loc[seg_id, 'avg_first_50000'] = xc[:50000].mean()
    X.loc[seg_id, 'avg_last_50000'] = xc[-50000:].mean()
    X.loc[seg_id, 'avg_first_10000'] = xc[:10000].mean()
    X.loc[seg_id, 'avg_last_10000'] = xc[-10000:].mean()
    X.loc[seg_id, 'min_first_50000'] = xc[:50000].min()
    X.loc[seg_id, 'min_last_50000'] = xc[-50000:].min()
    X.loc[seg_id, 'min_first_10000'] = xc[:10000].min()
    X.loc[seg_id, 'min_last_10000'] = xc[-10000:].min()
    X.loc[seg_id, 'max_first_50000'] = xc[:50000].max()
    X.loc[seg_id, 'max_last_50000'] = xc[-50000:].max()
    X.loc[seg_id, 'max_first_10000'] = xc[:10000].max()
    X.loc[seg_id, 'max_last_10000'] = xc[-10000:].max()

    X.loc[seg_id, 'max_to_min'] = xc.max() / np.abs(xc.min())
    X.loc[seg_id, 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
    X.loc[seg_id, 'count_big'] = len(xc[np.abs(xc) > 500])
    X.loc[seg_id, 'sum'] = xc.sum()

    X.loc[seg_id, 'mean_change_rate_first_50000'] = np.mean(np.nonzero((np.diff(xc[:50000]) / xc[:50000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_50000'] = np.mean(np.nonzero((np.diff(xc[-50000:]) / xc[-50000:][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_first_10000'] = np.mean(np.nonzero((np.diff(xc[:10000]) / xc[:10000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_10000'] = np.mean(np.nonzero((np.diff(xc[-10000:]) / xc[-10000:][:-1]))[0])

    X.loc[seg_id, 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, 'q99'] = np.quantile(xc, 0.99)
    X.loc[seg_id, 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, 'q01'] = np.quantile(xc, 0.01)
    X.loc[seg_id, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X.loc[seg_id, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X.loc[seg_id, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X.loc[seg_id, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)

    X.loc[seg_id, 'trend'] = add_trend_feature(xc)
    X.loc[seg_id, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X.loc[seg_id, 'abs_mean'] = np.abs(xc).mean()
    X.loc[seg_id, 'abs_std'] = np.abs(xc).std()

    X.loc[seg_id, 'mad'] = xc.mad()  # Series.mad() was removed in pandas 2.0; use (xc - xc.mean()).abs().mean() there
    X.loc[seg_id, 'kurt'] = xc.kurtosis()
    X.loc[seg_id, 'skew'] = xc.skew()
    X.loc[seg_id, 'med'] = xc.median()

    X.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
    X.loc[seg_id, 'classic_sta_lta1_mean'] = classic_sta_lta(xc, 500, 10000).mean()
    X.loc[seg_id, 'classic_sta_lta2_mean'] = classic_sta_lta(xc, 5000, 100000).mean()
    X.loc[seg_id, 'classic_sta_lta3_mean'] = classic_sta_lta(xc, 3333, 6666).mean()
    X.loc[seg_id, 'classic_sta_lta4_mean'] = classic_sta_lta(xc, 10000, 25000).mean()
    X.loc[seg_id, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
    ewma = pd.Series.ewm
    X.loc[seg_id, 'exp_Moving_average_300_mean'] = ewma(xc, span=300).mean().mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_3000_mean'] = ewma(xc, span=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_30000_mean'] = ewma(xc, span=30000).mean().mean(skipna=True)  # span matches the feature name

    no_of_std = 2
    X.loc[seg_id, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X.loc[seg_id, 'MA_700MA_BB_high_mean'] = X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']
    X.loc[seg_id, 'MA_700MA_BB_low_mean'] = X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']
    X.loc[seg_id, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    # the 700-point moving-average mean is reused for the 400-window bands
    X.loc[seg_id, 'MA_400MA_BB_high_mean'] = X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']
    X.loc[seg_id, 'MA_400MA_BB_low_mean'] = X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']
    X.loc[seg_id, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()

    X.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, 'q999'] = np.quantile(xc, 0.999)
    X.loc[seg_id, 'q001'] = np.quantile(xc, 0.001)
    X.loc[seg_id, 'ave10'] = stats.trim_mean(xc, 0.1)

    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna().values
        x_roll_mean = xc.rolling(windows).mean().dropna().values

        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()

        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()
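# A hedged driver sketch for create_features(): the segment length, the
# 'train' DataFrame, and the column names follow the LANL-earthquake-style
# setup this function resembles and are assumptions here; the helpers it
# calls (add_trend_feature, classic_sta_lta) must also be in scope.
rows = 150_000
segments = int(np.floor(train.shape[0] / rows))  # 'train' assumed loaded
X_tr = pd.DataFrame(index=range(segments), dtype=np.float64)
for seg_id in range(segments):
    seg = train.iloc[seg_id * rows: seg_id * rows + rows]
    create_features(seg_id, seg, X_tr)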
    r.close()
    k.close()
    return None


if __name__ == "__main__":
    quart = args.groups
    treelist = LoadTrees(args.treefile, quart, args.outgroup, args.dlm)
    WriteTrees(treelist)
    taxdict = AgeAndSupport(treelist, quart)
    AgeStats(taxdict, quart)
    SupportStats(taxdict, quart)
    root_height, sum_support = FilterTree(treelist)
    distlist, splist = pairwiseDistance(treelist, quart)
    dists = [i[0] for i in distlist]
    print("{} {}-{}".format(np.mean(dists), np.quantile(dists, .05), np.quantile(dists, .95)))
    if args.windows:
        WindowStats(args.windows, taxdict, quart, root_height, sum_support, splist, distlist)

##test tree
#t='(rivulorum_F790:0.1188,(((longipalpusC_551_12634:6e-09,(longipalpusC_13:6e-09,(longipalpusC_16:6e-09,longipalpusC_551_12533:6e-09)0.921:6e-09)0.909:5e-09)0.014:0.00222105,(longipalpusC_15:6e-09,(longipalpusC_12:6e-09,(longipalpusC_11:0.00112065,(longipalpusC_4:6e-09,(parensis_KwaF762:5e-09,((parensis_KwaF761:6e-09,parensis_KwaF766:6e-09)0.92:6e-09,(parensis_KwaF767:0,parensis_KwaF768:0,parensis_KwaF769:0,parensis_KwaF835:0,parensis_KwaF851:0)1:6e-09)0:6e-09)0.583:6e-09)0.292:0.00110003)0:2.27e-07)0:5e-09)0.711:2.305e-05)0.955:0.00882347,(((((vaneedeni_KwaF782:6e-09,vaneedeni_KwaF780:6e-09)0.921:6e-09,vaneedeni_KwaF774:6e-09)0:6e-09,(vaneedeni_KwaF784:6e-09,vaneedeni_KwaF783:6e-09)0.767:6e-09)0.889:5e-09,(vaneedeni_KwaF775:6e-09,(vaneedeni_KwaF773:6e-09,vaneedeni_KwaF786:0.00112541)0.367:5e-09)1:7.08e-07)0.995:0.0102093,((funestuscf_MALAF105_7:0,funestuscf_MALAF99_4:0,funestuscf_MALF98_2:0)1:0.00447164,((funestus_MozF123:6e-09,((((funestus_MozF35:0,funestus_MozF804:0,funestus_Zam281:0)1:6e-09,funestus_TanF561:6e-09)0.936:5e-09,funestus_TanF601:6e-09)0.395:6e-09,funestus_MozF29:6e-09)0.405:0.00334382)0.646:6e-09,(funestus_GhaF264:6e-09,(funestus_Ken4590:6e-09,(funestus_GhaF265:6e-09,(funestus_Ugf399:6e-09,(funestus_Ugf403:6e-09,(funestus_MozF260:6e-09,funestus_Ugf401:6e-09)0.731:5e-09)0.85:6e-09)0.133:0.00222583)0.459:6e-09)0:5e-09)0.453:6e-09)0.894:0.00222803)0.789:6e-09)0.726:0.00356974)1:0.1188);'
#tree = PhyloTree(t)
#tree.set_species_naming_function(lambda node: node.name.split("_")[0])
#tree.set_outgroup( tree&'rivulorum_F790')
##
##tree.check_monophyly(["longipalpusC"], target_attr="species")
### 0 is bool
### 2 is problem nodes
##tree.get_monophyletic(values=["longipalpusC"], target_attr="species")
#tree.remove_child(child)
#tree.prune(nodes, preserve_branch_length=True)
def evaluate(
    self,
    iter_unit,
    num_iter,
    batch_size,
    warmup_steps=50,
    log_every_n_steps=1,
    is_benchmark=False,
    export_dir=None,
):
    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)
    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for evaluation!')
    if hvd_utils.is_using_hvd() and hvd.rank() != 0:
        raise RuntimeError('Multi-GPU inference is not supported')

    estimator_params = {}
    image_classifier = self._get_estimator(
        mode='validation',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="validation",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=batch_size,
        )
    else:
        num_epochs = 1
        num_decay_steps = -1
        num_steps = num_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="validation")

    eval_hooks = []
    if hvd.rank() == 0:
        self.eval_logging_hook = hooks.BenchmarkLoggingHook(
            global_batch_size=batch_size, warmup_steps=warmup_steps, logging_steps=log_every_n_steps)
        eval_hooks.append(self.eval_logging_hook)
        print('Starting Model Evaluation...')
        print("Evaluation Epochs", num_epochs)
        print("Evaluation Steps", num_steps)
        print("Decay Steps", num_decay_steps)
        print("Global Batch Size", batch_size)

    def evaluation_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input... ")
            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=False,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=False,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        else:
            print("Using Synthetic Data ...\n")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        eval_results = image_classifier.evaluate(
            input_fn=evaluation_data_fn,
            steps=num_steps,
            hooks=eval_hooks,
        )

        eval_throughput = self.eval_logging_hook.mean_throughput.value()
        eval_latencies = np.array(self.eval_logging_hook.latencies) * 1000
        eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
        eval_latencies_mean = np.mean(eval_latencies)

        dllogger.log(
            data={
                'top1_accuracy': float(eval_results['top1_accuracy']),
                'top5_accuracy': float(eval_results['top5_accuracy']),
                'eval_throughput': eval_throughput,
                'eval_latency_avg': eval_latencies_mean,
                'eval_latency_p90': eval_latencies_q[0],
                'eval_latency_p95': eval_latencies_q[1],
                'eval_latency_p99': eval_latencies_q[2],
            },
            step=tuple())

        if export_dir is not None:
            dllogger.log(data={'export_dir': export_dir}, step=tuple())
            input_receiver_fn = data_utils.get_serving_input_receiver_fn(
                batch_size=None,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                dtype=self.run_hparams.dtype)
            image_classifier.export_savedmodel(export_dir, input_receiver_fn)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    print('Model evaluation finished')
def _uniform_sampler_(self, data, size, ax=-1):
    # Returns a sampler drawing uniformly between the 10th and 90th
    # percentiles of `data` along `ax` (the assumed intent; the original
    # expression sampled in [q90, q90 + q10] and ignored `ax`).
    shape = np.mean(data, ax).shape + (size, )
    lo = np.quantile(data, 0.1, axis=ax)[..., None]
    hi = np.quantile(data, 0.9, axis=ax)[..., None]
    return lambda: lo + (hi - lo) * np.random.rand(*shape)
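# A quick standalone check of the sampling expression above: every draw
# should land between each row's 10th and 90th percentiles. Synthetic data.
import numpy as np

data = np.random.default_rng(3).normal(size=(4, 1000))
lo = np.quantile(data, 0.1, axis=-1)[..., None]
hi = np.quantile(data, 0.9, axis=-1)[..., None]
samples = lo + (hi - lo) * np.random.rand(4, 50)
assert np.all(samples >= lo) and np.all(samples <= hi)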
geo_location_map_lat = geo_location_zip['Latitude'].to_dict()
geo_location_map_long = geo_location_zip['Longitude'].to_dict()
nyc_data['Longitude'] = nyc_data['ZIP CODE'].map(geo_location_map_long)
nyc_data['Latitude'] = nyc_data['ZIP CODE'].map(geo_location_map_lat)

# for visualization: replace borough codes with names (.replace avoids the
# chained-assignment pattern that pandas warns about)
nyc_data['BOROUGH'] = nyc_data['BOROUGH'].replace(
    {1: 'Manhattan', 2: 'Bronx', 3: 'Brooklyn', 4: 'Queens', 5: 'Staten Island'})

# create quantile-based price bins (6 edges -> 5 bins, one per label);
# note: prices at or below the 0.2 quantile fall outside the bins and become NaN
bins = [np.quantile(nyc_data['SALE PRICE'], q) for q in (0.2, 0.4, 0.5, 0.6, 0.80, 1)]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
nyc_data['SALE_PRICE_BIN'] = pd.cut(nyc_data['SALE PRICE'], labels=labels, bins=bins, include_lowest=False)

# visualization
plt.style.use('ggplot')
f, (ax1, ax2) = plt.subplots(2, figsize=[12, 12])
fig1 = sns.scatterplot(x='Longitude', y='Latitude', hue='BOROUGH', style='BOROUGH', data=nyc_data, ax=ax1)
fig1.set_title('GEO REAL ESTATE MAP BY DISTRICT')
fig1.legend(loc='upper left')
fig2 = sns.scatterplot(x='Longitude', y='Latitude', hue='SALE_PRICE_BIN', style='SALE_PRICE_BIN', data=nyc_data, ax=ax2)
fig2.set_title('GEO REAL ESTATE MAP BY PRICE')
fig2.legend(loc='upper left')
plt.show()
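# A minimal check of the binning logic above on synthetic prices: six
# quantile edges yield five intervals, and the bottom ~20% of rows fall
# outside the first edge and are left unbinned (NaN).
import numpy as np
import pandas as pd

prices = pd.Series(np.random.default_rng(4).lognormal(mean=13, sigma=1, size=1000))
edges = [np.quantile(prices, q) for q in (0.2, 0.4, 0.5, 0.6, 0.8, 1)]
binned = pd.cut(prices, bins=edges, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
print(binned.value_counts())  # roughly 20/10/10/20/20% of rows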
def array_quantile_global(arr, q):
    return np.quantile(arr, q)
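# np.quantile broadcasts over q, so the helper works with a scalar or a list
# of quantiles and returns an array in the latter case:
import numpy as np

arr = np.arange(101)
print(array_quantile_global(arr, 0.5))           # 50.0
print(array_quantile_global(arr, [0.05, 0.95]))  # [ 5. 95.]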
        if rls:
            receiveTime = int(rls[0].split()[0])
            delay = (receiveTime - sendTime) / 1000000.0
            delays.append(delay)
    if len(delays) != len(receiveLines):
        print("warning: did not find delay for all packets")
    if delays:
        avgdelays.append(np.mean(delays))

if pdrs:
    print(prot, pccr, ptbi, len(pdrs))
    pdrdata[(prot, pccr, ptbi)][0].append(x)
    pdrdata[(prot, pccr, ptbi)][1].append(np.mean(pdrs))
    pdrdata[(prot, pccr, ptbi)][2].append(np.mean(recs))
    # empirical 5%/95% quantile bands (previously +/- 2*np.std)
    pdrdata[(prot, pccr, ptbi)][3].append(np.quantile(pdrs, 0.05))
    pdrdata[(prot, pccr, ptbi)][4].append(np.quantile(pdrs, 0.95))
if senderdcs:
    senderdcdata[(prot, pccr, ptbi)][0].append(x)
    senderdcdata[(prot, pccr, ptbi)][1].append(np.mean(senderdcs))
    senderdcdata[(prot, pccr, ptbi)][2].append(np.quantile(senderdcs, 0.05))
    senderdcdata[(prot, pccr, ptbi)][3].append(np.quantile(senderdcs, 0.95))
    recvrdcdata[(prot, pccr, ptbi)][0].append(x)
    recvrdcdata[(prot, pccr, ptbi)][1].append(np.mean(recvrdcs))
    recvrdcdata[(prot, pccr, ptbi)][2].append(np.quantile(recvrdcs, 0.05))
    recvrdcdata[(prot, pccr, ptbi)][3].append(np.quantile(recvrdcs, 0.95))
if avgdelays:
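# The 5%/95% quantile columns above give asymmetric error bands; a hedged
# plotting sketch with synthetic per-setting runs (matplotlib's errorbar
# expects offsets below/above the mean, not absolute band edges):
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(5)
runs = np.random.default_rng(5).beta(8, 2, size=(5, 30))  # e.g. per-setting PDRs
mean = runs.mean(axis=1)
q05 = np.quantile(runs, 0.05, axis=1)
q95 = np.quantile(runs, 0.95, axis=1)
plt.errorbar(x, mean, yerr=[mean - q05, q95 - mean], fmt='o-', capsize=3)
plt.show()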