Example no. 1
def normalize_transactions(trans):
    #print (trans)
    for i in range(len(trans)):
        trans[i][0] = util.normalize(trans[i][0], amount_stat)
        trans[i][1] = util.normalize(trans[i][1], date_stat)

    return trans
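`util.normalize(value, stat)` itself is not part of this example; a minimal sketch of one plausible implementation, assuming `stat` exposes the field's `mean` and `std` and that normalization is a plain z-score:

# Hypothetical sketch only: the real util.normalize is not shown in this
# example. Assumes `stat` carries the field's mean and standard deviation
# and that normalization is a z-score.
def normalize(value, stat):
    if stat.std == 0:
        return 0.0
    return (value - stat.mean) / stat.std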
Example no. 2
def normalize_account(account, bin_sizes):
    out = np.zeros((np.sum(bin_sizes)), dtype=np.float32)
    out[0] = util.normalize(account[0], open_stat)
    out[1 + account[1]] = 1
    out[1 + bin_sizes[1]] = util.normalize(account[2], recent_stat)
    out[2 + bin_sizes[1]] = util.normalize(account[3], dormant_stat)
    out[3 + bin_sizes[1] + account[4]] = 1

    return out
Example no. 3
    def test_normalize(self):
        from src.util import normalize
        v = np.array([[1.0], [2.0], [3.0]])
        v_ln = np.sqrt(1.0**2 + 2.0**2 + 3.0**2)
        expected = np.array([[1.0 / v_ln], [2.0 / v_ln], [3.0 / v_ln]])
        # assertEqual cannot compare numpy arrays element-wise; use numpy's test helper
        np.testing.assert_array_almost_equal(normalize(v), expected)

        v = np.array([[0.0], [0.0], [0.0]])
        expected = np.array([[0.0], [0.0], [0.0]])
        np.testing.assert_array_almost_equal(normalize(v), expected)
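The test above pins down the behaviour of `src.util.normalize` fairly precisely; a minimal sketch consistent with it (hypothetical, the real implementation is not shown):

import numpy as np

# Hypothetical sketch: an L2 normalization consistent with the assertions
# above; zero vectors are returned unchanged to avoid division by zero.
def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm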
Example no. 4
def _compute_global_r_mac(features, pca=None):

    assert len(features.shape) == 3

    global_r_mac = np.zeros(
        (1, features.shape[2]))  # Sum of all regional features
    macs = compute_r_macs(features)

    for mac in macs:
        if pca:
            mac = pca.transform(mac)
            mac = normalize(mac)
        global_r_mac += mac

    return normalize(global_r_mac)
Example no. 5
def compute_r_macs(features, scales=(1, 4), verbose=False):
    """
    Computes regional maximum activations of convolutions
    (see arXiv:1511.05879v2)

    Args:
    scales: inclusive interval from which the scale parameter l is chosen
        from, where l=1 is a square region filling the whole image.
        Higher scales result in smaller regions following the formula
        given in the paper on p.4

    Returns: list of r_mac vectors of shape (1, N), where N is the
    depth of the convolutional feature maps
    """
    assert len(features.shape) == 3
    height, width = features.shape[0], features.shape[1]

    r_macs = []
    for scale in range(scales[0], scales[1] + 1):
        r_size = max(floor(2 * min(height, width) / (scale + 1)), 1)
        if verbose:
            print('Region width at scale {}: {}'.format(scale, r_size))

        # Uniform sampling of square regions with 40% overlap
        for region in _region_generator(features, r_size, 0.4):
            r_mac = _compute_mac(region)
            r_mac = normalize(np.expand_dims(r_mac, axis=0))
            r_macs.append(r_mac)

        if r_size == 1:
            break
    return r_macs
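`_region_generator` is referenced but not shown; a minimal sketch under the assumption that it slides a square window of side `r_size` over the feature map so that neighbouring windows overlap by roughly the given fraction:

# Hypothetical sketch only: the real _region_generator is not shown. It is
# assumed to slide a square window of side `size` over the spatial axes of
# `features`, with a stride chosen so neighbouring windows overlap by
# roughly `overlap`.
def _region_generator(features, size, overlap):
    height, width = features.shape[0], features.shape[1]
    step = max(int(size * (1.0 - overlap)), 1)
    for top in range(0, max(height - size, 0) + 1, step):
        for left in range(0, max(width - size, 0) + 1, step):
            yield features[top:top + size, left:left + size, :]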
Example no. 6
def create_buffer(data, delta, steps, mode="avg"):
  samples = []
  log("Creating chunks", steps=steps, mode=mode)
  for j in range(0, steps):
    if mode == "avg":
      samples.append(avg_in_area(data, j, delta))
    elif mode == "max":
      samples.append(max_in_area(data, j, delta))
    else:
      raise ValueError("Unsupported mode: %s" % mode)
    
  log("Finished sample chunk creation", length=len(samples))
  return normalize(samples)
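`avg_in_area` and `max_in_area` are not shown; minimal sketches assuming chunk `j` covers the slice `data[j*delta:(j+1)*delta]`:

import numpy as np

# Hypothetical sketches only: avg_in_area and max_in_area are not shown.
# Chunk j is assumed to cover the slice data[j*delta:(j+1)*delta].
def avg_in_area(data, j, delta):
    return float(np.mean(data[int(j * delta):int((j + 1) * delta)]))

def max_in_area(data, j, delta):
    return float(np.max(data[int(j * delta):int((j + 1) * delta)]))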
Example no. 7
def _compute_area_score(query, area, integral_image, exp=AML_EXP):
    """Computes cosine similarity between query representation and bounding box

    Args:
    query: L2 normalized representation of shape (1, dim)
    area: bounding box on integral image in the form of (left, upper, right, lower)
    integral_image: integral image of features on which the bounding box lies
    exp: constant used in approximate max pooling
    """
    max_pool = _integral_image_sum(integral_image, area)
    max_pool = normalize(np.power(max_pool, 1.0 / exp))
    score = max_pool.dot(query.T)
    return np.clip(score, -1.0, 1.0).item()
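`_integral_image_sum` is not shown; a minimal sketch assuming `integral_image` holds cumulative sums over both spatial axes and `area` uses inclusive bounds:

import numpy as np

# Hypothetical sketch only: assumes `integral_image` has shape (H, W, dim)
# and holds cumulative sums over both spatial axes, and that `area` uses
# inclusive (left, upper, right, lower) bounds.
def _integral_image_sum(integral_image, area):
    left, upper, right, lower = area
    total = integral_image[lower, right].astype(np.float64)
    if upper > 0:
        total -= integral_image[upper - 1, right]
    if left > 0:
        total -= integral_image[lower, left - 1]
    if upper > 0 and left > 0:
        total += integral_image[upper - 1, left - 1]
    return total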
Example no. 8
def _compute_global_r_mac(features, pca=None):
    """
    Computes global aggregation of rmacs from convolutional features

    Args:
    pca (optional): sklearn.decomposition.PCA object which is applied to each 
    mac to whiten the features
    
    Returns: global image descriptor of shape (1, N), where N is the 
    depth of the convolutional feature maps
    """
    assert len(features.shape) == 3
    
    global_r_mac = np.zeros((1, features.shape[2]))  # Sum of all regional features
    macs = compute_r_macs(features)
    
    for mac in macs:
        if pca:
            mac = pca.transform(mac)
            mac = normalize(mac)
        global_r_mac += mac

    return normalize(global_r_mac)
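`_compute_mac` is not shown in these examples; a minimal sketch assuming MAC denotes the maximum activation over the spatial dimensions of an (H, W, N) feature map:

import numpy as np

# Hypothetical sketch only: MAC is assumed to be the maximum activation over
# the spatial dimensions of an (H, W, N) feature map, giving a length-N vector.
def _compute_mac(features):
    return np.max(features, axis=(0, 1))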
Example no. 9
    def test_normalize(self):
        normalized = util.normalize(pd.DataFrame(data={'values': [2, 3, 6]}), 2)
        # expectation: x -> 2 * (x - 2) / (6 - 2)
        self.assertEqual(normalized['values'].tolist(), [0, 0.5, 2])
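This test implies a min-max scaling multiplied by a constant factor; a minimal sketch consistent with the expectation in the comment (hypothetical, the real `util.normalize` is not shown), and the same signature reappears in the plotting snippets further below:

# Hypothetical sketch only: min-max scaling times a constant multiplier,
# matching the expectation x -> 2 * (x - 2) / (6 - 2) asserted above.
def normalize(data, multiplier=1):
    return multiplier * (data - data.min()) / (data.max() - data.min())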
Example no. 10
def run_experiment(n_steps, n_expected_leafs, total_drift,
                   total_diffusion, drift_density, p_settle, drift_direction,
                   chain_length, burnin, hpd_values, working_dir,
                   turnover=0.2, clock_rate=1.0, movement_model='rrw',
                   max_fossil_age=0, min_n_fossils=10, **kwargs):
    """Run an experiment `n_runs` times with the specified parameters.

    Args:
        n_runs (int): Number of times the experiment should be repeated.
        n_steps (int): Number of steps to simulate.
        n_expected_leafs (int): Number of data points expected in the end
            (only expected, not exact value, due to stochasticity)
        total_drift (float): The total distance that every society will travel
            due to drift over the simulated time.
        total_diffusion (float): The expected total distance that every society
            will move away from the root, due to diffusion.
        drift_density (float): Frequency of drift occurring (does not affect
            the total drift).
        p_settle (float): Probability of stopping drift and 'settling' at the
            current location (only diffusion from this point).
        drift_direction (np.array): The direction of drift.
        chain_length (int): MCMC chain length in BEAST analysis.
        burnin (int): MCMC burnin steps in BEAST analysis.
        hpd_values (list): The values for the HPD coverage statistics.

    Kwargs:
        movement_model (str): The movement model to be used in the BEAST analysis
            ('rrw' or 'brownian').
        working_dir (str): The working directory in which intermediate files
            will be dumped.
        drop_fossils (bool): Remove extinct taxa from the sampled phylogeny.
        max_fossil_age (float): Remove all fossils older than this.
        min_n_fossils (int): If `max_fossil_age` is set: Ensure sampled trees
            have at least this many fossils.

    Returns:
        dict: Statistics of the experiments (different error values).
    """
    # Ensure arrays are np.ndarrays
    root = np.zeros(2)
    drift_direction = np.asarray(drift_direction)
    min_leaves, max_leaves = 0.4 * n_expected_leafs, 2. * n_expected_leafs

    # Paths
    xml_path = working_dir + 'nowhere.xml'

    # Inferred parameters
    drift_direction = normalize(drift_direction)
    step_var = total_diffusion_2_step_var(total_diffusion, n_steps)
    _step_drift = total_drift_2_step_drift(total_drift, n_steps, drift_density=drift_density)
    step_mean = _step_drift * drift_direction

    # Compute birth-/death-rate from n_expected_leaves, n_steps and turnover
    eff_div_rate = np.log(n_expected_leafs) / n_steps
    birth_rate = eff_div_rate / (1 - turnover)
    death_rate = birth_rate * turnover

    # b = e / (4/5) = e*5/4
    # d = b * 1/5 = e*5/4/5 = e/4

    # Check parameter validity
    if True:
        assert 0 < drift_density <= 1
        assert 0 <= turnover < 1
        assert 0 <= death_rate < birth_rate <= 1
        for hpd in hpd_values:
            assert 0 < hpd < 100
        assert burnin < chain_length

    valid_tree = False
    while not valid_tree:
        # Run Simulation
        p0 = np.zeros(2)
        world = VectorWorld()
        tree_simu = VectorState(world, p0, step_mean, step_var, clock_rate, birth_rate,
                                drift_frequency=drift_density, death_rate=death_rate)
        tree_simu, world = run_simulation(n_steps, tree_simu, world, condition_on_root=True)
        tree_simu.drop_fossils(max_fossil_age)

        # Check whether tree satisfies criteria...
        #    Criteria: not too small/big & root has two extant subtrees
        n_leafs = len([n for n in tree_simu.iter_leafs() if n.depth == n_steps])
        valid_tree = (min_leaves < n_leafs < max_leaves)

        if n_leafs < min_leaves:
            print('Invalid: Not enough leafs: %i' % n_leafs)
            continue
        elif n_leafs > max_leaves:
            print('Invalid: Too many leafs: %i' % n_leafs)
            continue
        for c in tree_simu.children:
            if not any(n.depth == n_steps for n in c.iter_leafs()):
                valid_tree = False
                print('Invalid: One side of the tree died!')
                break

        if valid_tree and (max_fossil_age > 0):
            if tree_simu.height() < n_steps:
                # This might happen if all languages on one side of the first split go extinct.
                valid_tree = False
                print('Invalid: Tree lost in height!')
            elif tree_simu.n_fossils() < min_n_fossils:
                valid_tree = False
                print('Invalid: Not enough fossils (only %i)' % tree_simu.n_fossils())

    print('Valid tree with %i leaves and %i fossils' % (tree_simu.n_leafs(), tree_simu.n_fossils()))
    if movement_model == 'tree_statistics':
        results = {}

    else:

        # Create an XML file as input for the BEAST analysis
        tree_simu.write_beast_xml(xml_path, chain_length, movement_model=movement_model,
                                  drift_prior_std=1.)

        # Run phylogeographic reconstruction in BEAST
        run_beast(working_dir=working_dir)

        results = evaluate(working_dir, burnin, hpd_values, root)

        # Add statistics about simulated tree (to compare between simulation modes)
        results['observed_stdev'] = np.hypot(*np.std(tree_simu.get_leaf_locations(), axis=0))
        leafs_mean = np.mean(tree_simu.get_leaf_locations(), axis=0)
        leafs_mean_offset = leafs_mean - root
        results['observed_drift_x'] = leafs_mean_offset[0]
        results['observed_drift_y'] = leafs_mean_offset[1]
        results['observed_drift_norm'] = np.hypot(*leafs_mean_offset)

    # Always include tree stats
    tree_stats = tree_statistics(tree_simu)
    results.update(tree_stats)

    return results
Example no. 11
beta1 = 0.5
Z_dim = 100
mu, sigma = 0, 1

# Load and normalize data
X1 = util.getDataTumour(base_path + "data/NvAndMelTrain.pkl",
                        value="nv",
                        resize=64)
X2 = util.getDataTumour(base_path + "data/NvAndMelTest.pkl",
                        value="nv",
                        resize=64)
X_train = np.concatenate((X1, X2), axis=0)
#X_train = util.getDataTumour(base_path + "data/NvAndMelTest.pkl", value="nv", resize=64)
assert (X_train.shape[1] == X_train.shape[2])
print(X_train.shape)
X_train = util.normalize(X_train, -1, 1)

# Inputs
tf.reset_default_graph()
dataset = tf.data.Dataset.from_tensor_slices(X_train).repeat(epochs).shuffle(
    buffer_size=len(X_train)).batch(batchSize, drop_remainder=True)
iterator = dataset.make_one_shot_iterator()
X = iterator.get_next()
batchSizeTensor = tf.placeholder(tf.int32)
Z = network.sample_noise(batchSizeTensor, Z_dim)
isTraining = tf.placeholder(dtype=tf.bool)

# Networks
G_z = network.generator(Z, isTraining)
D_logits_real = network.discriminator(X)
D_logits_fake = network.discriminator(G_z, reuse=True)
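`util.normalize(X_train, -1, 1)` is not shown; a minimal sketch under the assumption that it linearly rescales the array into the requested `[low, high]` interval:

import numpy as np

# Hypothetical sketch only: assumes util.normalize rescales the array
# linearly into the requested [low, high] interval.
def normalize(x, low, high):
    x = x.astype(np.float32)
    return low + (high - low) * (x - x.min()) / (x.max() - x.min())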
Example no. 12
def _compute_area_score(query, area, integral_image, exp=AML_EXP):
    max_pool = _integral_image_sum(integral_image, area)
    max_pool = normalize(np.power(max_pool, 1.0 / exp))
    score = max_pool.dot(query.T)
    return np.clip(score, -1.0, 1.0).item()
Example no. 13
    'wind_speed': 'mean',
    'pressure': 'mean',
    'PM2.5': 'mean',
    'PM10': 'mean',
    'O3': 'mean'
})

fig1, axes1 = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))
fig2, axes2 = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

scale = 500  # multiplier scale for enlarging plotted circles

# plot normalized PM2.5 per station
aggDf.plot.scatter(x='latitude',
                   y='longitude',
                   s=util.normalize(aggDf['PM2.5'], scale),
                   title='PM2.5',
                   fontsize=13,
                   ax=axes1[0])

# plot normalized PM10 per station
aggDf.plot.scatter(x='latitude',
                   y='longitude',
                   s=util.normalize(aggDf['PM10'], scale),
                   title='PM10',
                   fontsize=13,
                   ax=axes1[1])

# plot normalized O3 per station
aggDf.plot.scatter(x='latitude',
                   y='longitude',
                   s=util.normalize(aggDf['O3'], scale),
                   title='O3',
                   fontsize=13,
                   ax=axes1[2])
Example no. 14
def main(args):
    assert args.dataset in ['mnist', 'cifar', 'svhn'], \
        "Dataset parameter must be either 'mnist', 'cifar' or 'svhn'"
    assert args.attack in ['fgsm', 'bim-a', 'bim-b', 'jsma', 'cw', 'all'], \
        "Attack parameter must be either 'fgsm', 'bim-a', 'bim-b', " \
        "'jsma' or 'cw'"
    assert os.path.isfile('../data/model_%s.h5' % args.dataset), \
        'model file not found... must first train model using train_model.py.'
    assert os.path.isfile('../data/Adv_%s_%s.npy' %
                          (args.dataset, args.attack)), \
        'adversarial sample file not found... must first craft adversarial ' \
        'samples using craft_adv_samples.py'
    print('Loading the data and model...')
    # Load the model
    model = load_model('../data/model_%s.h5' % args.dataset)
    # Load the dataset
    X_train, Y_train, X_test, Y_test = get_data(args.dataset)
    # Check attack type, select adversarial and noisy samples accordingly
    print('Loading noisy and adversarial samples...')
    if args.attack == 'all':
        # TODO: implement 'all' option
        #X_test_adv = ...
        #X_test_noisy = ...
        raise NotImplementedError("'All' types detector not yet implemented.")
    else:
        # Load adversarial samples
        X_test_adv = np.load('../data/Adv_%s_%s.npy' %
                             (args.dataset, args.attack))
        # Craft an equal number of noisy samples
        X_test_noisy = get_noisy_samples(X_test, X_test_adv, args.dataset,
                                         args.attack)
    # Check model accuracies on each sample type
    for s_type, dataset in zip(['normal', 'noisy', 'adversarial'],
                               [X_test, X_test_noisy, X_test_adv]):
        _, acc = model.evaluate(dataset,
                                Y_test,
                                batch_size=args.batch_size,
                                verbose=0)
        print("Model accuracy on the %s test set: %0.2f%%" %
              (s_type, 100 * acc))
        # Compute and display average perturbation sizes
        if not s_type == 'normal':
            l2_diff = np.linalg.norm(dataset.reshape(
                (len(X_test), -1)) - X_test.reshape((len(X_test), -1)),
                                     axis=1).mean()
            print("Average L-2 perturbation size of the %s test set: %0.2f" %
                  (s_type, l2_diff))
    # Refine the normal, noisy and adversarial sets to only include samples for
    # which the original version was correctly classified by the model
    preds_test = model.predict_classes(X_test,
                                       verbose=0,
                                       batch_size=args.batch_size)
    inds_correct = np.where(preds_test == Y_test.argmax(axis=1))[0]
    X_test = X_test[inds_correct]
    X_test_noisy = X_test_noisy[inds_correct]
    X_test_adv = X_test_adv[inds_correct]

    ## Get Bayesian uncertainty scores
    print('Getting Monte Carlo dropout variance predictions...')
    uncerts_normal = get_mc_predictions(model, X_test,
                                        batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)
    uncerts_noisy = get_mc_predictions(model, X_test_noisy,
                                       batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)
    uncerts_adv = get_mc_predictions(model, X_test_adv,
                                     batch_size=args.batch_size) \
        .var(axis=0).mean(axis=1)

    ## Get KDE scores
    # Get deep feature representations
    print('Getting deep feature representations...')
    X_train_features = get_deep_representations(model,
                                                X_train,
                                                batch_size=args.batch_size)
    X_test_normal_features = get_deep_representations(
        model, X_test, batch_size=args.batch_size)
    X_test_noisy_features = get_deep_representations(
        model, X_test_noisy, batch_size=args.batch_size)
    X_test_adv_features = get_deep_representations(model,
                                                   X_test_adv,
                                                   batch_size=args.batch_size)
    # Train one KDE per class
    print('Training KDEs...')
    class_inds = {}
    for i in range(Y_train.shape[1]):
        class_inds[i] = np.where(Y_train.argmax(axis=1) == i)[0]
    kdes = {}
    warnings.warn(
        "Using pre-set kernel bandwidths that were determined "
        "optimal for the specific CNN models of the paper. If you've "
        "changed your model, you'll need to re-optimize the "
        "bandwidth.")
    for i in range(Y_train.shape[1]):
        kdes[i] = KernelDensity(kernel='gaussian',
                                bandwidth=BANDWIDTHS[args.dataset]) \
            .fit(X_train_features[class_inds[i]])
    # Get model predictions
    print('Computing model predictions...')
    preds_test_normal = model.predict_classes(X_test,
                                              verbose=0,
                                              batch_size=args.batch_size)
    preds_test_noisy = model.predict_classes(X_test_noisy,
                                             verbose=0,
                                             batch_size=args.batch_size)
    preds_test_adv = model.predict_classes(X_test_adv,
                                           verbose=0,
                                           batch_size=args.batch_size)
    # Get density estimates
    print('computing densities...')
    densities_normal = score_samples(kdes, X_test_normal_features,
                                     preds_test_normal)
    densities_noisy = score_samples(kdes, X_test_noisy_features,
                                    preds_test_noisy)
    densities_adv = score_samples(kdes, X_test_adv_features, preds_test_adv)

    ## Z-score the uncertainty and density values
    uncerts_normal_z, uncerts_adv_z, uncerts_noisy_z = normalize(
        uncerts_normal, uncerts_adv, uncerts_noisy)
    densities_normal_z, densities_adv_z, densities_noisy_z = normalize(
        densities_normal, densities_adv, densities_noisy)

    ## Build detector
    values, labels, lr = train_lr(densities_pos=densities_adv_z,
                                  densities_neg=np.concatenate(
                                      (densities_normal_z, densities_noisy_z)),
                                  uncerts_pos=uncerts_adv_z,
                                  uncerts_neg=np.concatenate(
                                      (uncerts_normal_z, uncerts_noisy_z)))

    ## Evaluate detector
    # Compute logistic regression model predictions
    probs = lr.predict_proba(values)[:, 1]
    # Compute AUC
    n_samples = len(X_test)
    # The first 2/3 of 'probs' is the negative class (normal and noisy samples),
    # and the last 1/3 is the positive class (adversarial samples).
    _, _, auc_score = compute_roc(probs_neg=probs[:2 * n_samples],
                                  probs_pos=probs[2 * n_samples:])
    print('Detector ROC-AUC score: %0.4f' % auc_score)
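The `normalize(...)` used for the Z-scoring above accepts several arrays at once; a minimal sketch assuming they are standardized jointly with the statistics of their concatenation:

import numpy as np

# Hypothetical sketch only: assumes all input arrays are standardized jointly
# using the mean and standard deviation of their concatenation.
def normalize(*arrays):
    combined = np.concatenate(arrays)
    mu, sigma = combined.mean(), combined.std()
    return tuple((a - mu) / sigma for a in arrays)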
Example no. 15
            station['SMAPE'] = util.SMAPE(actual=actual, forecast=forecast)
            smape = pd.DataFrame(
                    data=[[city, station[const.ID], station[const.LONG], station[const.LAT],
                           pollutant, station['SMAPE'], actual.size]],
                    columns=smape_columns)
            local_smapes = local_smapes.append(other=smape, ignore_index=True)
            smapes = smapes.append(other=smape, ignore_index=True)

        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 3))

        # Plot SMAPE values sorted
        local_smapes.sort_values(by='SMAPE', inplace=True)
        g = sns.stripplot(x=const.ID, y='SMAPE', data=local_smapes, ax=axes[0])
        g.set_xticklabels(labels=g.get_xticklabels(), rotation=90)  # rotate station names for readability

        # Plot SMAPE values on map
        local_smapes.plot.scatter(x=const.LONG, y=const.LAT, s=util.normalize(local_smapes['SMAPE'], multiplier=150),
                            title=city + '_' + pollutant, fontsize=13, ax=axes[1])
        # Plot station names on positions
        for _, station in stations_dict.items():
            if 'SMAPE' in station:
                label = ('%d ' % (100 * station['SMAPE'])) + station[const.ID][0:2]  # 64 be
                axes[1].annotate(label, xy=(station[const.LONG], station[const.LAT]),
                                 xytext=(5, 0), textcoords='offset points', )
        plt.draw()

# Calculate total error
total_smape = np.sum(smapes['SMAPE'] * smapes['count']) / np.sum(smapes['count'])
print('Total SMAPE:', total_smape)

plt.show()
Example no. 16
def compute_localization_representation(features):

    mac = _compute_mac(features)
    return normalize(mac)
Example no. 17
def compute_localization_representation(features):
    """Computes an L2-normalized representation of convolutional features
    suitable for localization
    """
    mac = _compute_mac(features)
    return normalize(mac)