def normalize_transactions(trans):
    # print(trans)
    for i in range(len(trans)):
        trans[i][0] = util.normalize(trans[i][0], amount_stat)
        trans[i][1] = util.normalize(trans[i][1], date_stat)
    return trans
def normalize_account(account, bin_sizes):
    out = np.zeros((np.sum(bin_sizes)), dtype=np.float32)
    out[0] = util.normalize(account[0], open_stat)                    # scalar feature normalized with open_stat
    out[1 + account[1]] = 1                                           # one-hot bin for account[1]
    out[1 + bin_sizes[1]] = util.normalize(account[2], recent_stat)   # scalar feature normalized with recent_stat
    out[2 + bin_sizes[1]] = util.normalize(account[3], dormant_stat)  # scalar feature normalized with dormant_stat
    out[3 + bin_sizes[1] + account[4]] = 1                            # one-hot bin for account[4]
    return out
def test_normalize(self):
    from src.util import normalize

    v = np.array([[1.0], [2.0], [3.0]])
    v_ln = np.sqrt(1.0**2 + 2.0**2 + 3.0**2)
    expected = np.array([[1.0 / v_ln], [2.0 / v_ln], [3.0 / v_ln]])
    # assertEqual cannot compare numpy arrays element-wise; use numpy's test helper
    np.testing.assert_array_almost_equal(normalize(v), expected)

    v = np.array([[0.0], [0.0], [0.0]])
    expected = np.array([[0.0], [0.0], [0.0]])
    np.testing.assert_array_almost_equal(normalize(v), expected)
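# Illustrative sketch (not part of the original sources): an implementation of
# src.util.normalize that would satisfy the test above -- L2-normalize the input
# and return an all-zero vector unchanged. The actual src.util.normalize may differ.
import numpy as np

def normalize_l2_sketch(v):
    norm = np.linalg.norm(v)
    return v if norm == 0 else v / norm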
def compute_r_macs(features, scales=(1, 4), verbose=False):
    """Computes regional maximum activations of convolutions
    (see arXiv:1511.05879v2).

    Args:
        scales: inclusive interval from which the scale parameter l is chosen,
            where l=1 is a square region filling the whole image. Higher scales
            result in smaller regions following the formula given in the paper on p.4.

    Returns:
        list of R-MAC vectors of shape (1, N), where N is the depth of the
        convolutional feature maps
    """
    assert len(features.shape) == 3
    height, width = features.shape[0], features.shape[1]

    r_macs = []
    for scale in range(scales[0], scales[1] + 1):
        r_size = max(floor(2 * min(height, width) / (scale + 1)), 1)
        if verbose:
            print('Region width at scale {}: {}'.format(scale, r_size))

        # Uniform sampling of square regions with 40% overlap
        for region in _region_generator(features, r_size, 0.4):
            r_mac = _compute_mac(region)
            r_mac = normalize(np.expand_dims(r_mac, axis=0))
            r_macs.append(r_mac)
        if r_size == 1:
            break
    return r_macs
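# Illustrative usage sketch (not part of the original sources): compute_r_macs
# expects an H x W x N convolutional feature volume and returns one L2-normalized
# regional MAC per sampled region. The dummy feature volume below is purely for
# demonstration; the helpers (_compute_mac, _region_generator, normalize) are
# assumed to be defined in the same module.
import numpy as np

dummy_features = np.random.rand(20, 30, 512).astype(np.float32)  # H x W x N
r_macs = compute_r_macs(dummy_features, scales=(1, 3), verbose=True)
print(len(r_macs), r_macs[0].shape)  # each regional MAC has shape (1, 512)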
def create_buffer(data, delta, steps, mode="avg"):
    samples = []
    log("Creating chunks", steps=steps, mode=mode)
    for j in range(0, steps):
        if mode == "avg":
            samples.append(avg_in_area(data, j, delta))
        elif mode == "max":
            samples.append(max_in_area(data, j, delta))
        else:
            # raising a plain string is invalid in Python 3; raise an exception instead
            raise ValueError("Unsupported mode: %s" % mode)
    log("Finished sample chunk creation", length=len(samples))
    return normalize(samples)
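# Illustrative sketch (not part of the original sources): the chunking helpers
# avg_in_area and max_in_area are not shown above. A plausible reading, assumed
# here, is that chunk j covers the window data[j * delta:(j + 1) * delta].
def avg_in_area_sketch(data, j, delta):
    chunk = data[j * delta:(j + 1) * delta]
    return sum(chunk) / len(chunk)


def max_in_area_sketch(data, j, delta):
    return max(data[j * delta:(j + 1) * delta])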
def _compute_area_score(query, area, integral_image, exp=AML_EXP): """Computes cosine similarity between query representation and bounding box Args: query: L2 normalized representation of shape (1, dim) area: bounding box on integral image in the form of (left, upper, right, lower) integral_image: integral image of features on which the bounding box lies exp: constant used in approximate max pooling """ max_pool = _integral_image_sum(integral_image, area) max_pool = normalize(np.power(max_pool, 1.0 / exp)) score = max_pool.dot(query.T) return np.clip(score, -1.0, 1.0).item()
def _compute_global_r_mac(features, pca=None): """ Computes global aggregation of rmacs from convolutional features Args: pca (optional): sklearn.decomposition.PCA object which is applied to each mac to whiten the features Returns: global image descriptor of shape (1, N), where N is the depth of the convolutional feature maps """ assert len(features.shape) == 3 global_r_mac = np.zeros((1, features.shape[2])) # Sum of all regional features macs = compute_r_macs(features) for mac in macs: if pca: mac = pca.transform(mac) mac = normalize(mac) global_r_mac += mac return normalize(global_r_mac)
def test_normalize(self):
    normalized = util.normalize(pd.DataFrame(data={'values': [2, 3, 6]}), 2)
    # expectation: x -> 2 * (x - 2) / (6 - 2)
    self.assertEqual(normalized['values'].tolist(), [0, 0.5, 2])
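# Illustrative sketch (not part of the original sources): a util.normalize
# consistent with the expectation above -- min-max scaling of a DataFrame (or
# Series) followed by a constant multiplier.
def normalize_minmax_sketch(df, multiplier=1):
    return multiplier * (df - df.min()) / (df.max() - df.min())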
def run_experiment(n_steps, n_expected_leafs, total_drift, total_diffusion,
                   drift_density, p_settle, drift_direction, chain_length,
                   burnin, hpd_values, working_dir, turnover=0.2,
                   clock_rate=1.0, movement_model='rrw', max_fossil_age=0,
                   min_n_fossils=10, **kwargs):
    """Run a single experiment with the specified parameters.

    Args:
        n_steps (int): Number of steps to simulate.
        n_expected_leafs (int): Number of data points expected in the end
            (only an expectation, not an exact value, due to stochasticity).
        total_drift (float): The total distance that every society will travel
            due to drift over the simulated time.
        total_diffusion (float): The expected total distance that every society
            will move away from the root, due to diffusion.
        drift_density (float): Frequency of drift occurring (does not affect
            the total drift).
        p_settle (float): Probability of stopping drift and 'settling' at the
            current location (only diffusion from this point).
        drift_direction (np.array): The direction of drift.
        chain_length (int): MCMC chain length in BEAST analysis.
        burnin (int): MCMC burnin steps in BEAST analysis.
        hpd_values (list): The values for the HPD coverage statistics.
        working_dir (str): The working directory in which intermediate files
            will be dumped.

    Kwargs:
        turnover (float): Ratio of death rate to birth rate.
        movement_model (str): The movement model to be used in the BEAST
            analysis ('rrw' or 'brownian').
        drop_fossils (bool): Remove extinct taxa from the sampled phylogeny.
        max_fossil_age (float): Remove all fossils older than this.
        min_n_fossils (int): If `max_fossil_age` is set: ensure sampled trees
            have at least this many fossils.

    Returns:
        dict: Statistics of the experiments (different error values).
    """
    # Ensure arrays to be np.array
    root = np.zeros(2)
    drift_direction = np.asarray(drift_direction)
    min_leaves, max_leaves = 0.4 * n_expected_leafs, 2. * n_expected_leafs

    # Paths
    xml_path = working_dir + 'nowhere.xml'

    # Inferred parameters
    drift_direction = normalize(drift_direction)
    step_var = total_diffusion_2_step_var(total_diffusion, n_steps)
    _step_drift = total_drift_2_step_drift(total_drift, n_steps, drift_density=drift_density)
    step_mean = _step_drift * drift_direction

    # Compute birth-/death-rate from n_expected_leaves, n_steps and turnover
    eff_div_rate = np.log(n_expected_leafs) / n_steps
    birth_rate = eff_div_rate / (1 - turnover)
    death_rate = birth_rate * turnover
    # E.g. with turnover = 0.2: birth_rate = eff_div_rate / 0.8 and
    # death_rate = birth_rate / 5 = eff_div_rate / 4.

    # Check parameter validity
    assert 0 < drift_density <= 1
    assert 0 <= turnover < 1
    assert 0 <= death_rate < birth_rate <= 1
    for hpd in hpd_values:
        assert 0 < hpd < 100
    assert burnin < chain_length

    valid_tree = False
    while not valid_tree:
        # Run simulation
        p0 = np.zeros(2)
        world = VectorWorld()
        tree_simu = VectorState(world, p0, step_mean, step_var, clock_rate, birth_rate,
                                drift_frequency=drift_density, death_rate=death_rate)
        tree_simu, world = run_simulation(n_steps, tree_simu, world, condition_on_root=True)
        tree_simu.drop_fossils(max_fossil_age)

        # Check whether the tree satisfies the criteria:
        # not too small/big & root has two extant subtrees
        n_leafs = len([n for n in tree_simu.iter_leafs() if n.depth == n_steps])
        valid_tree = (min_leaves < n_leafs < max_leaves)

        if n_leafs < min_leaves:
            print('Invalid: Not enough leafs: %i' % n_leafs)
            continue
        elif n_leafs > max_leaves:
            print('Invalid: Too many leafs: %i' % n_leafs)
            continue

        for c in tree_simu.children:
            if not any(n.depth == n_steps for n in c.iter_leafs()):
                valid_tree = False
                print('Invalid: One side of the tree died!')
                break

        if valid_tree and (max_fossil_age > 0):
            if tree_simu.height() < n_steps:
                # This might happen if all languages on one side of the first split go extinct.
                valid_tree = False
                print('Invalid: Tree lost in height!')
            elif tree_simu.n_fossils() < min_n_fossils:
                valid_tree = False
                print('Invalid: Not enough fossils (only %i)' % tree_simu.n_fossils())

    print('Valid tree with %i leaves and %i fossils' % (tree_simu.n_leafs(), tree_simu.n_fossils()))

    if movement_model == 'tree_statistics':
        results = {}
    else:
        # Create an XML file as input for the BEAST analysis
        tree_simu.write_beast_xml(xml_path, chain_length, movement_model=movement_model,
                                  drift_prior_std=1.)

        # Run phylogeographic reconstruction in BEAST
        run_beast(working_dir=working_dir)
        results = evaluate(working_dir, burnin, hpd_values, root)

        # Add statistics about simulated tree (to compare between simulation modes)
        results['observed_stdev'] = np.hypot(*np.std(tree_simu.get_leaf_locations(), axis=0))
        leafs_mean = np.mean(tree_simu.get_leaf_locations(), axis=0)
        leafs_mean_offset = leafs_mean - root
        results['observed_drift_x'] = leafs_mean_offset[0]
        results['observed_drift_y'] = leafs_mean_offset[1]
        results['observed_drift_norm'] = np.hypot(*leafs_mean_offset)

    # Always include tree stats
    tree_stats = tree_statistics(tree_simu)
    results.update(tree_stats)

    return results
beta1 = 0.5
Z_dim = 100
mu, sigma = 0, 1

# Load and normalize data
X1 = util.getDataTumour(base_path + "data/NvAndMelTrain.pkl", value="nv", resize=64)
X2 = util.getDataTumour(base_path + "data/NvAndMelTest.pkl", value="nv", resize=64)
X_train = np.concatenate((X1, X2), axis=0)
# X_train = util.getDataTumour(base_path + "data/NvAndMelTest.pkl", value="nv", resize=64)
assert (X_train.shape[1] == X_train.shape[2])
print(X_train.shape)
X_train = util.normalize(X_train, -1, 1)

# Inputs
tf.reset_default_graph()
dataset = tf.data.Dataset.from_tensor_slices(X_train).repeat(epochs).shuffle(
    buffer_size=len(X_train)).batch(batchSize, drop_remainder=True)
iterator = dataset.make_one_shot_iterator()
X = iterator.get_next()
batchSizeTensor = tf.placeholder(tf.int32)
Z = network.sample_noise(batchSizeTensor, Z_dim)
isTraining = tf.placeholder(dtype=tf.bool)

# Networks
G_z = network.generator(Z, isTraining)
D_logits_real = network.discriminator(X)
D_logits_fake = network.discriminator(G_z, reuse=True)
        'wind_speed': 'mean',
        'pressure': 'mean',
        'PM2.5': 'mean',
        'PM10': 'mean',
        'O3': 'mean'
    })

fig1, axes1 = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))
fig2, axes2 = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
scale = 500  # multiplier scale for enlarging plotted circles

# plot normalized PM2.5 per station
aggDf.plot.scatter(x='latitude', y='longitude', s=util.normalize(aggDf['PM2.5'], scale),
                   title='PM2.5', fontsize=13, ax=axes1[0])
# plot normalized PM10 per station
aggDf.plot.scatter(x='latitude', y='longitude', s=util.normalize(aggDf['PM10'], scale),
                   title='PM10', fontsize=13, ax=axes1[1])
# plot normalized O3 per station
aggDf.plot.scatter(x='latitude', y='longitude',
def main(args):
    assert args.dataset in ['mnist', 'cifar', 'svhn'], \
        "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'"
    assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \
        "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \
        "'jsma' or 'cw'"
    assert os.path.isfile('../data/model_%s.h5' % args.dataset), \
        'model file not found... must first train model using train_model.py.'
    assert os.path.isfile('../data/Adv_%s_%s.npy' % (args.dataset, args.attack)), \
        'adversarial sample file not found... must first craft adversarial ' \
        'samples using craft_adv_samples.py'
    print('Loading the data and model...')
    # Load the model
    model = load_model('../data/model_%s.h5' % args.dataset)
    # Load the dataset
    X_train, Y_train, X_test, Y_test = get_data(args.dataset)
    # Check attack type, select adversarial and noisy samples accordingly
    print('Loading noisy and adversarial samples...')
    if args.attack == 'all':
        # TODO: implement 'all' option
        # X_test_adv = ...
        # X_test_noisy = ...
        raise NotImplementedError("'All' types detector not yet implemented.")
    else:
        # Load adversarial samples
        X_test_adv = np.load('../data/Adv_%s_%s.npy' % (args.dataset, args.attack))
        # Craft an equal number of noisy samples
        X_test_noisy = get_noisy_samples(X_test, X_test_adv, args.dataset,
                                         args.attack)
    # Check model accuracies on each sample type
    for s_type, dataset in zip(['normal', 'noisy', 'adversarial'],
                               [X_test, X_test_noisy, X_test_adv]):
        _, acc = model.evaluate(dataset, Y_test, batch_size=args.batch_size,
                                verbose=0)
        print("Model accuracy on the %s test set: %0.2f%%" %
              (s_type, 100 * acc))
        # Compute and display average perturbation sizes
        if not s_type == 'normal':
            l2_diff = np.linalg.norm(
                dataset.reshape((len(X_test), -1)) -
                X_test.reshape((len(X_test), -1)),
                axis=1
            ).mean()
            print("Average L-2 perturbation size of the %s test set: %0.2f" %
                  (s_type, l2_diff))
    # Refine the normal, noisy and adversarial sets to only include samples for
    # which the original version was correctly classified by the model
    preds_test = model.predict_classes(X_test, verbose=0,
                                       batch_size=args.batch_size)
    inds_correct = np.where(preds_test == Y_test.argmax(axis=1))[0]
    X_test = X_test[inds_correct]
    X_test_noisy = X_test_noisy[inds_correct]
    X_test_adv = X_test_adv[inds_correct]

    ## Get Bayesian uncertainty scores
    print('Getting Monte Carlo dropout variance predictions...')
    uncerts_normal = get_mc_predictions(model, X_test,
                                        batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)
    uncerts_noisy = get_mc_predictions(model, X_test_noisy,
                                       batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)
    uncerts_adv = get_mc_predictions(model, X_test_adv,
                                     batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)

    ## Get KDE scores
    # Get deep feature representations
    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model, X_train,
                                                batch_size=args.batch_size)
    X_test_normal_features = get_deep_representations(model, X_test,
                                                      batch_size=args.batch_size)
    X_test_noisy_features = get_deep_representations(model, X_test_noisy,
                                                     batch_size=args.batch_size)
    X_test_adv_features = get_deep_representations(model, X_test_adv,
                                                   batch_size=args.batch_size)
    # Train one KDE per class
    print('Training KDEs...')
    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]
    kdes = {}
    warnings.warn("Using pre-set kernel bandwidths that were determined "
                  "optimal for the specific CNN models of the paper. If you've "
                  "changed your model, you'll need to re-optimize the "
                  "bandwidth.")
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian',
                                bandwidth=BANDWIDTHS[args.dataset]) \
            .fit(X_train_features[class_inds[i]])
    # Get model predictions
    print('Computing model predictions...')
    preds_test_normal = model.predict_classes(X_test, verbose=0,
                                              batch_size=args.batch_size)
    preds_test_noisy = model.predict_classes(X_test_noisy, verbose=0,
                                             batch_size=args.batch_size)
    preds_test_adv = model.predict_classes(X_test_adv, verbose=0,
                                           batch_size=args.batch_size)
    # Get density estimates
    print('computing densities...')
    densities_normal = score_samples(kdes, X_test_normal_features,
                                     preds_test_normal)
    densities_noisy = score_samples(kdes, X_test_noisy_features,
                                    preds_test_noisy)
    densities_adv = score_samples(kdes, X_test_adv_features,
                                  preds_test_adv)

    ## Z-score the uncertainty and density values
    uncerts_normal_z, uncerts_adv_z, uncerts_noisy_z = normalize(
        uncerts_normal, uncerts_adv, uncerts_noisy)
    densities_normal_z, densities_adv_z, densities_noisy_z = normalize(
        densities_normal, densities_adv, densities_noisy)

    ## Build detector
    values, labels, lr = train_lr(
        densities_pos=densities_adv_z,
        densities_neg=np.concatenate((densities_normal_z, densities_noisy_z)),
        uncerts_pos=uncerts_adv_z,
        uncerts_neg=np.concatenate((uncerts_normal_z, uncerts_noisy_z))
    )

    ## Evaluate detector
    # Compute logistic regression model predictions
    probs = lr.predict_proba(values)[:, 1]
    # Compute AUC
    n_samples = len(X_test)
    # The first 2/3 of 'probs' is the negative class (normal and noisy samples),
    # and the last 1/3 is the positive class (adversarial samples).
    _, _, auc_score = compute_roc(probs_neg=probs[:2 * n_samples],
                                  probs_pos=probs[2 * n_samples:])
    print('Detector ROC-AUC score: %0.4f' % auc_score)
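# Illustrative sketch (not part of the original sources): the z-scoring helper
# `normalize` used above takes the three score arrays, standardizes them with a
# shared mean and standard deviation, and returns them in the same order.
import numpy as np

def normalize_zscore_sketch(normal, adv, noisy):
    combined = np.concatenate((normal, adv, noisy))
    z = (combined - combined.mean()) / combined.std()
    n, a = len(normal), len(adv)
    return z[:n], z[n:n + a], z[n + a:]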
station['SMAPE'] = util.SMAPE(actual=actual, forecast=forecast)
smape = pd.DataFrame(
    data=[[city, station[const.ID], station[const.LONG], station[const.LAT],
           pollutant, station['SMAPE'], actual.size]],
    columns=smape_columns)
local_smapes = local_smapes.append(other=smape, ignore_index=True)
smapes = smapes.append(other=smape, ignore_index=True)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 3))

# Plot SMAPE values sorted
local_smapes.sort_values(by='SMAPE', inplace=True)
g = sns.stripplot(x=const.ID, y='SMAPE', data=local_smapes, ax=axes[0])
g.set_xticklabels(labels=g.get_xticklabels(), rotation=90)  # rotate station names for readability

# Plot SMAPE values on map
local_smapes.plot.scatter(x=const.LONG, y=const.LAT,
                          s=util.normalize(local_smapes['SMAPE'], multiplier=150),
                          title=city + '_' + pollutant, fontsize=13, ax=axes[1])

# Plot station names on positions
for _, station in stations_dict.items():
    if 'SMAPE' in station:
        label = ('%d ' % (100 * station['SMAPE'])) + station[const.ID][0:2]
        axes[1].annotate(label, xy=(station[const.LONG], station[const.LAT]),
                         xytext=(5, 0), textcoords='offset points')
plt.draw()

# Calculate total error
total_smape = np.sum(smapes['SMAPE'] * smapes['count']) / np.sum(smapes['count'])
print('Total SMAPE:', total_smape)
plt.show()
def compute_localization_representation(features): """Computes a L2 normalized representation of convolutional features suitable to localization """ mac = _compute_mac(features) return normalize(mac)