Example #1
def create_datasets():
    rs = RandomState(0)
    data_a = np.sort(rs.normal(0, 10, 500)).astype(int).reshape(100, 5)
    gene_names_a = list("ABCDE")
    cell_types_a = ["alpha", "beta", "gamma", "delta"]
    labels_a = rs.choice(np.arange(len(cell_types_a)), data_a.shape[0])
    batch_indices_a = rs.choice(np.arange(5), size=data_a.shape[0])

    data_b = np.sort(rs.normal(100, 10, 300)).astype(int).reshape(100, 3)
    gene_names_b = list("BFA")
    cell_types_b = ["alpha", "epsilon", "rho"]
    labels_b = rs.choice(np.arange(len(cell_types_b)), data_b.shape[0])
    batch_indices_b = rs.choice(np.arange(5), size=data_b.shape[0])

    dataset_a = GeneExpressionDataset()
    dataset_b = GeneExpressionDataset()
    dataset_a.populate_from_data(X=data_a,
                                 labels=labels_a,
                                 gene_names=gene_names_a,
                                 cell_types=cell_types_a,
                                 batch_indices=batch_indices_a)
    dataset_a.name = "test_a"

    dataset_b.populate_from_data(X=data_b,
                                 labels=labels_b,
                                 gene_names=gene_names_b,
                                 cell_types=cell_types_b,
                                 batch_indices=batch_indices_b)
    dataset_b.name = "test_b"
    return dataset_a, dataset_b
Example #2
def test_update_fundamental_matrix():
    prng = RandomState(20150101)
    P = compute_transition_matrix(karate_club_graph())
    n = P.shape[0]
    order = arange(P.shape[0])
    previous_index = prng.choice(order, 1)
    previous_node = order[previous_index]
    non_absorbing_nodes = chain(range(previous_index),
                                range(previous_index + 1, n))
    non_absorbing_nodes = list(non_absorbing_nodes)
    order = order[non_absorbing_nodes]
    F = compute_fundamental_matrix(
        P[non_absorbing_nodes, :][:, non_absorbing_nodes])
    absorbing_nodes = [previous_node]
    P_updated = P.copy()
    F_updated = F
    while P_updated.shape[0] >= 3:
        next_node = order[prng.choice(len(order), 1)]
        (P_updated, F_updated, order, previous_index) = \
            update_fundamental_matrix(P_updated, F_updated, next=next_node,
                                      previous=previous_node,
                                      previous_index=previous_index,
                                      node_order=order)
        previous_node = next_node
        absorbing_nodes.append(next_node)
        non_absorbing_nodes = [x for x in range(n) if x not in absorbing_nodes]
        F_slow = compute_fundamental_matrix(
            P[non_absorbing_nodes, :][:, non_absorbing_nodes])
        error_at_step = sum(sum(F_updated - F_slow).T)[0, 0]
        assert abs(error_at_step) < 1e-8, "Error is more than 1e-8."
Example #3
def test_unrelated_columns(N=60, random_seed=12345):
    """
    Test to see if 'unrelated' columns jam up the analysis.
    See Github Issue 43.
    https://github.com/ACCLAB/DABEST-python/issues/44.
    
    Added in v0.2.5.
    """

    # rng = RandomState(MT19937(random_seed))
    rng = RandomState(PCG64(random_seed))
    # rng = np.random.default_rng(seed=random_seed)

    df = pd.DataFrame({
        'groups':
        rng.choice(['Group 1', 'Group 2', 'Group 3'], size=(N, )),
        'color':
        rng.choice(['green', 'red', 'purple'], size=(N, )),
        'value':
        rng.random(size=(N, ))
    })

    df['unrelated'] = np.nan

    test = load(data=df, x='groups', y='value', idx=['Group 1', 'Group 2'])

    md = test.mean_diff.results

    assert md.difference[0] == pytest.approx(-0.0322, abs=1e-4)
    assert md.bca_low[0] == pytest.approx(-0.2279, abs=1e-4)
    assert md.bca_high[0] == pytest.approx(0.1613, abs=1e-4)
Example #4
def make_ratings(n_users, n_items, min_rating_per_user, max_rating_per_user,
                 rating_choices, seed=None, shuffle=True):
    """Randomly generate a (user_id, item_id, rating) array

    Return
    ------
        ndarray with shape (n_samples, 3)

    """
    if not (isinstance(rating_choices, list) or
            isinstance(rating_choices, tuple)):
        raise ValueError("'rating_choices' must be a list or tuple")
    if min_rating_per_user < 0 or min_rating_per_user >= n_items:
        raise ValueError("invalid 'min_rating_per_user' invalid")
    if (min_rating_per_user > max_rating_per_user) or \
       (max_rating_per_user >= n_items):
        raise ValueError("invalid 'max_rating_per_user' invalid")

    rs = RandomState(seed=seed)
    user_arrs = []
    for user_id in xrange(n_users):
        item_count = rs.randint(min_rating_per_user, max_rating_per_user)
        item_ids = rs.choice(n_items, item_count, replace=False)
        ratings = rs.choice(rating_choices, item_count)
        arr = np.stack(
            [np.repeat(user_id, item_count), item_ids, ratings], axis=1)
        user_arrs.append(arr)

    ratings = np.array(np.vstack(user_arrs))
    ratings[:, 2] = ratings[:, 2].astype('float')
    if shuffle:
        rs.shuffle(ratings)
    return ratings
Example #6
def merge_two_zones(zones, np1, np2, seed=None):
    #
    rnd = RandomState()
    if seed is not None:
        rnd = RandomState(seed)
    #
    i = rnd.choice(a=range(0, np2))
    j = rnd.choice(a=range(0, np1))
    i_ = i
    j_ = j
    dir_ = rnd.choice(a=[0, 1])
    if dir_ == 0:
        if 0 < i < np2 - 1:
            i_ = rnd.choice(a=[i - 1, i + 1])
        elif i == 0:
            i_ = 1
        else:
            i_ = i - 1
    else:
        if 0 < j < np1 - 1:
            j_ = rnd.choice(a=[j - 1, j + 1])
        elif j == 0:
            j_ = 1
        else:
            j_ = j - 1
    zones_ = {
        k: nodes
        for k, nodes in zones.iteritems() if k != (i, j) and k != (i_, j_)
    }
    new_zone = list(zones[(i, j)])
    new_zone.extend(zones[i_, j_])
    zones_["m"] = new_zone
    return zones_
Example #7
    def create_random_selection(self,
                                N_elements=None,
                                scan_percentage=None,
                                random_type=equal,  # `equal` comes from the surrounding module (not shown)
                                sort_dimensions=False):

        rs = RandomState(seed=0)
        # draw 5 elements of `a` (truncated from this excerpt) without
        # replacement, weighting each element by a softmax over -a
        rs.choice(a, 5, p=np.exp(-a) / sum(np.exp(-a)), replace=False)
Example #8
def test_segmentation():
    PRNG = RandomState()
    PRNG2 = RandomState()
    if args.seed > 0:
        PRNG.seed(args.seed)
        PRNG2.seed(args.seed)

    transform = Compose(
        [
            [ColorJitter(prob=0.75), None],
            Merge(),
            Expand((0.8, 1.5)),
            RandomCompose([
                # RandomResize(1, 1.5),
                RandomRotate(10),
                RandomShift(0.1)
            ]),
            Scale(300),
            # ElasticTransform(100),
            RandomCrop(300),
            HorizontalFlip(),
            Split([0, 3], [3, 6]),
            #[SubtractMean(mean=VOC.MEAN), None],
        ],
        PRNG,
        border='constant',
        fillval=VOC.MEAN,
        anchor_index=3)

    voc_dataset = VOCSegmentation(root=args.root,
                                  image_set=[('2007', 'trainval')],
                                  transform=transform,
                                  instance=False)
    viz = Viz()

    results = []
    count = 0
    i = PRNG2.choice(len(voc_dataset))
    for _ in range(1000):
        img, target = voc_dataset[i]
        img2 = viz.blend_segmentation(img, target)

        con = np.hstack([img, target, img2])
        results.append(con)
        cv2.imshow('result', con[..., ::-1])
        c = cv2.waitKey(500)

        if c == 27 or c == ord('q'):  # ESC / 'q'
            break
        elif c == ord('c') or count >= 3:
            count = 0
            i = PRNG2.choice(len(voc_dataset))
        count += 1
Example #9
def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
                              resamples=5000, random_seed=12345):
    """Bootstraps the effect_size for 2 groups."""
    
    from . import effsize as __es
    import numpy as np
    from numpy.random import PCG64, RandomState
    
    # rng = RandomState(default_rng(random_seed))
    rng = RandomState(PCG64(random_seed))

    out = np.repeat(np.nan, resamples)
    x0_len = len(x0)
    x1_len = len(x1)
    
    for i in range(int(resamples)):
        
        if is_paired:
            if x0_len != x1_len:
                raise ValueError("The two arrays do not have the same length.")
            random_idx = rng.choice(x0_len, x0_len, replace=True)
            x0_sample = x0[random_idx]
            x1_sample = x1[random_idx]
        else:
            x0_sample = rng.choice(x0, x0_len, replace=True)
            x1_sample = rng.choice(x1, x1_len, replace=True)
            
        out[i] = __es.two_group_difference(x0_sample, x1_sample,
                                          is_paired, effect_size)
    
    # check whether there are any infinities in the bootstrap,
    # which likely indicates the sample sizes are too small as
    # the computation of Cohen's d and Hedges' g necessitated 
    # a division by zero.
    # Added in v0.2.6.
    
    # num_infinities = len(out[np.isinf(out)])
    # print(num_infinities)
    # if num_infinities > 0:
    #     warn_msg = "There are {} bootstraps that are not defined. "\
    #     "This is likely due to smaple sample sizes. "\
    #     "The values in a bootstrap for a group will be more likely "\
    #     "to be all equal, with a resulting variance of zero. "\
    #     "The computation of Cohen's d and Hedges' g will therefore "\
    #     "involved a division by zero. "
    #     warnings.warn(warn_msg.format(num_infinities), category="UserWarning")
        
    return out
Example #10
    def test_RandomRectangularPattern_ca_3ch_postit(self):
        rso = RandomState(1)
        state_tuple = rso.get_state()
        t = image_triggers.RandomRectangularPattern(
            3,
            3,
            3,
            color_algorithm='channel_assign',
            color_options={'cval': [255, 254, 253]},
            pattern_style='postit',
            random_state_obj=rso)
        actual_img = t.get_data()
        actual_mask = t.get_mask()

        # reset the random state and generate the pattern in the same manner
        rso.set_state(state_tuple)
        per_chan_expected_img = rso.choice(2, 3 * 3).reshape(
            (3, 3)).astype(bool)
        expected_img = np.zeros((3, 3, 3))
        expected_img[:, :, 0] = per_chan_expected_img * 255  # the color
        expected_img[:, :, 1] = per_chan_expected_img * 254  # the color
        expected_img[:, :, 2] = per_chan_expected_img * 253  # the color
        expected_mask = np.ones((3, 3)).astype(bool)
        self.assertTrue(np.array_equal(actual_img, expected_img))
        self.assertTrue(np.array_equal(actual_mask, expected_mask))
Example #11
def update_metropolis(field: np.ndarray, states: States, free_energy: float, interaction: Interaction,
                      interaction_coefficient: float, magnetization_coefficient: float, temperature: float,
                      random_state: RandomState) -> (np.ndarray, float):
    assert states
    assert field.shape[0] == field.shape[1]

    size = field.shape[0]
    min_x = 0 if FIX_LEFT is None else 1
    max_x = size if FIX_RIGHT is None else size - 1
    min_y = 0 if FIX_BOTTOM is None else 1
    max_y = size if FIX_TOP is None else size - 1
    random_x = random_state.randint(min_x, max_x)  # dim
    random_y = random_state.randint(min_y, max_y)

    new_spin = field[random_x, random_y]
    # spin flip always needs to lead to a change of spin
    while new_spin == field[random_x, random_y]:
        new_spin = random_state.choice(states)

    energy_delta, field_updated = calculate_energy_difference(field, random_x, random_y, new_spin, interaction,
                                                              interaction_coefficient, magnetization_coefficient)
    random_number = random_state.uniform()
    acceptance_probability = np.exp(-1. / temperature * energy_delta)
    print_if_verbose(f'Energy delta: {energy_delta}, random number: {random_number}, '
                     f'acceptance_probability: {acceptance_probability}')
    if energy_delta <= 0 or random_number < acceptance_probability:
        # free_energy_updated = free_energy - energy_delta
        print_if_verbose('Change accepted')
        return field_updated, free_energy - energy_delta
    else:
        print_if_verbose('Not accepted')
        return field, free_energy
Example #12
def sample_group_counts(
    random_state: RandomState, total: int, lam_low: float = 1.0, lam_high: float = 8.0
) -> List[int]:
    """
    Sample a list of integers which sum up to `total`.
    The probability of sampling an integer follows exponential decay, k ~ np.exp(-k * lam),
    where lam is a hyperparam sampled from a range [lam_low, lam_high).

    :param random_state: numpy random state
    :param total: the exact sum of the sampled numbers.
    :param lam_low: lower bound for lambda in exponential decay.
    :param lam_high: higher bound for lambda in exponential decay.
    :return: a list of positive integers that sum to `total`.
    """
    current_max = total
    counts = []
    while current_max > 0:
        candidates = range(1, current_max + 1)
        lam = random_state.uniform(lam_low, lam_high)
        probs = np.array([np.exp(-i * lam) for i in candidates])
        probs /= sum(probs)
        selected = random_state.choice(candidates, p=probs)
        counts.append(selected)
        current_max -= selected

    assert sum(counts) == total
    return counts
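
A minimal usage sketch (assuming the function above and numpy are importable): the sampled parts always add up to the requested total.

from numpy.random import RandomState

rs = RandomState(0)
counts = sample_group_counts(rs, total=20)
print(counts, sum(counts))  # the parts always sum to 20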
Example #13
def split_dataset(df, validation_percentage, seed):
    state = RandomState(seed)
    validation_indexes = state.choice(
        df.index, int(len(df.index) * validation_percentage), replace=False)
    training_set = df.loc[~df.index.isin(validation_indexes)]
    validation_set = df.loc[df.index.isin(validation_indexes)]
    return training_set, validation_set
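
A small hedged demo of the splitter above, assuming pandas is available and the function is in scope: with 10 rows and a 0.2 validation fraction, 2 rows land in the validation set.

import pandas as pd

df = pd.DataFrame({"x": range(10)})
train, val = split_dataset(df, validation_percentage=0.2, seed=0)
assert len(train) == 8 and len(val) == 2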
Example #14
class RandomGenerator(object):
    def __init__(self, seed=None):
        self._random = RandomState(seed=seed)

    def seed(self, seed):
        self._random = RandomState(seed=seed)

    def random(self):
        return self._random.rand()

    def randint(self, a, b=None):
        if b is None:
            b = a
            a = 0
        r = self._random.randint(a, high=b, size=1)
        return r[0]

    def sample(self, population, k):
        if k == 0:
            return []
        return list(self._random.choice(population, size=k, replace=False))

    def __getattr__(self, attr):
        return getattr(self._random, attr)

    def __getstate__(self):
        return {'_random': self._random}

    def __setstate__(self, d):
        self._random = d['_random']

    def uniform(self, low=0.0, high=1.0, size=None):
        return self._random.uniform(low, high, size)
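
A quick, hedged sketch of driving the wrapper above; the same seed reproduces the same sequence.

rng = RandomGenerator(seed=42)
print(rng.random())                     # float in [0, 1)
print(rng.randint(10))                  # int in [0, 10)
print(rng.sample(list(range(100)), 5))  # 5 distinct values, no replacement
print(rng.uniform(-1.0, 1.0, size=3))   # delegates to RandomState.uniform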
Example #15
def sample_roi_repr(roi_generator,
                    sample_per_batch,
                    nb_samples,
                    repr_model,
                    batch_size=32,
                    random_seed=12345,
                    q_size=20):
    '''Sample candidate ROIs and then extract their DL representations
    '''
    samples_seen = 0
    repr_list = []
    roi_q = []  # a queue for candid ROIs before they are scored.
    while samples_seen < nb_samples:
        rng = RandomState(samples_seen + random_seed)
        X, w = roi_generator.next()
        w /= w.sum()
        ri = rng.choice(len(X), sample_per_batch, replace=False, p=w)
        roi_q.append(X[ri])
        samples_seen += len(ri)
        if len(roi_q) >= q_size:
            X_q = np.concatenate(roi_q)
            repr_list.append(repr_model.predict(X_q, batch_size=batch_size))
            roi_q = []
    if len(roi_q) > 0:
        X_q = np.concatenate(roi_q)
        repr_list.append(repr_model.predict(X_q, batch_size=batch_size))
        roi_q = []
    return np.concatenate(repr_list)
Example #16
def test_chi2mixture():
    dof = 2
    mixture = 0.2
    n = 100

    random = RandomState(1)
    x = random.chisquare(dof, n)
    n0 = int((1 - mixture) * n)
    idxs = random.choice(n, n0, replace=False)
    x[idxs] = 0

    chi2mix = Chi2Mixture(
        scale_min=0.1,
        scale_max=5.0,
        dof_min=0.1,
        dof_max=5.0,
        qmax=0.1,
        tol=4e-3,
        lrt=x,
    )
    chi2mix.estimate_chi2mixture(x)
    pv = chi2mix.sf([0.0, 0.2])
    assert_allclose(pv, [0.19999999999999996, 0.1412935752078675])
    assert_allclose(chi2mix.scale, 1.9808080808080812)
    assert_allclose(chi2mix.dof, 0.891919191919192)
    assert_allclose(chi2mix.mixture, 0.199999999999999960)
Example #17
def flexible_values(val, size=1, random_state=None, min=-np.inf, max=np.inf):
    """Flexibly determine a number of values.

    Input format can be:
        - A numeric value, which will be used exactly.
        - A list of possible values, which will be randomly chosen from.
        - A tuple of (dist, arg0[, arg1, ...]), which will be used to generate
          random observations from a scipy random variable.

    """
    if random_state is None:
        random_state = RandomState()

    if np.isscalar(val):
        out = np.ones(size, np.array(val).dtype) * val
    elif isinstance(val, list):
        out = random_state.choice(val, size=size)
    elif isinstance(val, tuple):
        rv = getattr(stats, val[0])(*val[1:])
        out = truncated_sample(rv, size, min, max, random_state=random_state)
    else:
        raise TypeError("`val` must be scalar, set, or tuple")

    if size == 1:
        out = out.item()

    return out
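
Two of the three input formats, sketched under the assumption that the helper above is in scope (the tuple form additionally needs scipy.stats and the truncated_sample helper, which is not shown here).

from numpy.random import RandomState

rs = RandomState(0)
print(flexible_values(5, size=3, random_state=rs))          # exact value: [5 5 5]
print(flexible_values([1, 2, 3], size=4, random_state=rs))  # random picks from the list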
Example #18
class RandomGenerator(object):
    def __init__(self, seed=None):
        self._random = RandomState(seed=seed)

    def random(self):
        return self._random.rand()

    def randint(self, a, b=None):
        if b is None:
            b = a
            a = 0
        r = self._random.randint(a, high=b, size=1)
        return r[0]

    def sample(self, population, k):
        if k == 0:
            return []
        return self._random.choice(population, size=k, replace=False)

    def __getattr__(self, attr):
        return getattr(self._random, attr)

    def __getstate__(self):
        return {'_random': self._random}

    def __setstate__(self, d):
        self._random = d['_random']
Example #19
def sample_from_every_class(y, size, seed=None):
    """ Get a random sample, ensuring every class is represented.

        This helper function is useful when the sample size is small and
        we want to make sure that at least one sample from each class
        is included. This is required, for example, in logistic regression,
        where the classifier cannot handle classes where it has never seen
        any training examples.

        Params
        ------
        y : 1-dimensional numpy array
            The label/output array.
        size : int
            The desired number of samples.
        seed : RandomState object or None
            Provide a seed for reproducibility.

        Returns
        -------
        samples : numpy array of shape [size]
            The random samples.
    """
    if seed is None:
        seed = RandomState(1234)

    # Keep track of the classes which have not been sampled yet
    labels = np.unique(y)
    samples = []
    while len(samples) < size:
        idx = seed.choice(np.arange(len(y)))
        if (len(labels) == 0 and idx not in samples) or y[idx] in labels:
            samples.append(idx)
            labels = np.delete(labels, np.argwhere(labels == y[idx]))
    return samples
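
A hedged check of the guarantee described in the docstring: with three classes and size=3, each class contributes exactly one sample.

import numpy as np
from numpy.random import RandomState

y = np.array([0, 0, 0, 1, 1, 2])
idx = sample_from_every_class(y, size=3, seed=RandomState(0))
assert len(set(y[idx])) == 3  # every class represented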
Example #20
def test_ransac():
    '''
    Test RandomSampleConsensus
    '''
    num = 1000
    rng = RandomState()
    scale = 10
    points = np.zeros((num, 3), 'f8')
    for i in range(num):
        points[i, 0] = rng.rand() * scale
        points[i, 1] = rng.rand() * scale
        if i % 2 == 0:
            points[i, 2] = -points[i, 0] - points[i, 1]
        else:
            points[i, 2] = rng.rand() * scale
    cloud = pcl.PointCloud(points, fields=['x', 'y', 'z'])
    ransac = ps.RandomSampleConsensus(ps.SampleConsensusModelPlane(cloud))
    assert len(ransac.get_random_samples(rng.choice(num, 50), 30)) == 30
    ransac.max_iterations = 1000
    ransac.distance_threshold = 0.1
    ransac.compute_model()
    assert len(ransac.model_coefficients) == 4
    assert len(ransac.inliers) > 0
    assert len(ransac.model) == 3
    ransac.refine_model()
    assert len(ransac.model_coefficients) == 4
    assert len(ransac.inliers) > 0
    assert len(ransac.model) == 3
Example #21
def create(seed,
           head_prob = 0.8,
           two_col_prob = 0.3,
           section_range = [5,9]):
    '''
    Creates the same html for a given seed
    '''
    rand = RandomState(seed)
    soup = BeautifulSoup(_template, 'html.parser')
    if rand.rand() < head_prob:
        soup.body.insert(0, create_header(rand, soup, level=1))
    content = soup.body.div
    if rand.rand() < two_col_prob:
        content['class'] = 'col2'
    def append_section(new_elem, header_level = 0):
        div = soup.new_tag('div')
        if header_level > 0:
            div.append(create_header(rand, soup, level = header_level))
        div.append(new_elem)
        content.append(div)
    actions = [lambda:append_section(create_paragraph(rand, soup)),
               lambda:append_section(create_table(rand, soup), header_level = 3),
               lambda:append_section(create_list(rand, soup), header_level = 3)]
    section_count = sample_discrete_normal(rand, *section_range)
    for _sec_i in xrange(section_count):
        action = rand.choice(actions)
        action()
        
    return soup
Example #22
    def test_01(self):
        n = 64
        k = 4
        random_instance = RandomState()
        challenges = [random_instance.choice((-1, +1), n) 
            for i in range(1000)]
        challenges = array(challenges, dtype=int8)
        weights = random_instance.normal(loc=0, scale=1, size=(k, n))

        transformed_challenges = ph.transform_id(challenges, k)

        ph_solution = ph.eval_sign(transformed_challenges, weights)
        
        numpy_solution = sign(
            transpose(
                array([
                    dot(
                        transformed_challenges[:,l],
                        weights[l]
                    )
                    for l in range(k)
                ])
            )).astype(int)

        numpy_solution = array(numpy_solution, dtype=int8, copy=True)

        assert_array_equal(
            ph_solution,
            numpy_solution,
            "Comparison of eval_sign with numpy fails."
        )
Example #23
def load_dataset(song_folder_name='song_data',
                 artist_folder='artists',
                 nb_classes=20, random_state=42):
    """This function loads the dataset based on a location;
     it returns a list of spectrograms
     and their corresponding artists/song names"""

    # Get all songs saved as numpy arrays in the given folder
    song_list = os.listdir(song_folder_name)

    # Load the list of artists
    artist_list = os.listdir(artist_folder)

    # select the appropriate number of classes
    prng = RandomState(random_state)
    artists = prng.choice(artist_list, size=nb_classes, replace=False)

    # Create empty lists
    artist = []
    spectrogram = []
    song_name = []

    # Load each song into memory if the artist is included and return
    for song in song_list:
        with open(os.path.join(song_folder_name, song), 'rb') as fp:
            loaded_song = dill.load(fp)
        if loaded_song[0] in artists:
            artist.append(loaded_song[0])
            spectrogram.append(loaded_song[1])
            song_name.append(loaded_song[2])

    return artist, spectrogram, song_name
Example #24
    def do(self, img_obj: ImageEntity, pattern_obj: ImageEntity, random_state_obj: RandomState) -> ImageEntity:
        """
        Perform the described merge operation
        :param img_obj: The input object into which the pattern is to be inserted
        :param pattern_obj: The pattern object which is to be inserted into the image
        :param random_state_obj: used to sample from the possible valid locations, by providing a random state,
                                 we ensure reproducibility of the data
        :return: the merged object
        """
        img = img_obj.get_data()
        pattern = pattern_obj.get_data()
        num_chans = img.shape[2]
        if num_chans != 4:
            raise ValueError("Alpha Channel expected!")
        # find valid locations & remove bounding box
        i_rows, i_cols, _ = img.shape
        p_rows, p_cols, _ = pattern.shape

        # TODO: remove edges of image so that the patch always stays within
        #  the image
        valid_indices = np.where(img[0:i_rows-p_rows, 0:i_cols-p_cols, 3] != 0)
        num_valid_indices = len(valid_indices[0])
        random_index = random_state_obj.choice(num_valid_indices)
        insert_loc = [valid_indices[0][random_index],
                      valid_indices[1][random_index]]
        insert_loc_per_chan = np.tile(insert_loc, (4, 1)).astype(int)

        logger.debug("Selected insertion location randomly from available locations")

        inserter = InsertAtLocation(insert_loc_per_chan)
        inserted_img_obj = inserter.do(img_obj, pattern_obj, random_state_obj)

        return inserted_img_obj
Example #25
    def __getitem__(self, index):

        person_id = self.sort_keys[index]  # look up the person id (a str)
        nori_ids_list = self.pkl[person_id]['nori_id']

        rng = RandomState()
        nori_ids = rng.choice(nori_ids_list,
                              self.num_instance,
                              replace=(len(nori_ids_list) < self.num_instance))

        img_list = []
        nori_list = []
        for nori_id in nori_ids:

            market_img = self.nf.get(nori_id)
            texture_img = imdecode(market_img)

            while texture_img is None or texture_img.shape[
                    0] <= 0 or texture_img.shape[1] <= 0:

                new_nori_id = np.random.randint(0, len(nori_ids_list))
                market_img = self.nf.get(nori_ids[new_nori_id])
                texture_img = imdecode(market_img)

            texture_img = self.random_flip(texture_img)
            texture_img = self.to_tensor(texture_img)
            img_list.append(texture_img)
            nori_list.append(nori_id)

        idx_list = [index] * self.num_instance
        #texture_img_path = self.data[index]
        #texture_img = cv2.imread(texture_img_path)
        return img_list, idx_list
Example #26
def _sample_data(X: pd.DataFrame, y: List[Any], n_sample: int,
                 rng: random.RandomState) -> Tuple[pd.DataFrame, List[Any]]:
    if n_sample <= 0:
        return X, y
    else:
        indices = rng.choice(range(len(X)), n_sample, replace=True)
        return X.iloc[indices, :], itemgetter(*indices)(y)
Example #27
def assign_random_gt(input_vcf, output, sample_name="HG", default_af=0.01, seed=None):
    vcf_pointer = pysam.VariantFile(filename=input_vcf)
    new_header = vcf_pointer.header.copy()
    if "GT" not in new_header.formats:
        new_header.formats.add("GT", "1", "String", "Consensus Genotype across all datasets with called genotype")
        new_header.samples.add(sample_name)
    output.write(str(new_header))

    default_probs = [1 - default_af * (1 + default_af), default_af/2, default_af/2, default_af * default_af]
    rng = RandomState(seed)
    previous_pos = 0
    for rec in vcf_pointer.fetch():
        rec_copy = rec.copy()
        if "GT" not in rec_copy.format.keys():
            if rec_copy.pos == previous_pos:
                c = "0|0"
            else:
                if "AF" not in rec_copy.info.keys():
                    gt_probs = default_probs
                else:
                    af = rec_copy.info["AF"]
                    gt_probs = [1 - af * (1 + af), af/2, af/2, af * af]
                c = rng.choice(["0|0", "0|1", "1|0", "1|1"], p=gt_probs)
            output.write("\t".join([str(rec_copy)[:-1], "GT", c]) + "\n")
        previous_pos = rec_copy.pos

    vcf_pointer.close()
Example #28
def __permute(estimator, X, y, best_score, scorer, random_state):
    """
    Permute each predictor and measure difference from best score

    Args
    ----
    estimator (object): scikit learn estimator
    X, y: 2d and 1d numpy arrays data and labels from a test partition
    best_score (float): best score obtained on unperturbed data
    scorer (object): scoring method to use to measure importances
    random_state (int): random seed

    Returns
    -------
    scores (2D numpy array): scores for each predictor following permutation
    """

    from numpy.random import RandomState
    rstate = RandomState(random_state)

    # permute each predictor variable and assess difference in score
    scores = np.zeros(X.shape[1])

    for i in range(X.shape[1]):
        Xscram = np.copy(X)
        Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])

        # fit the model on the training data and predict the test data
        y_pred = estimator.predict(Xscram)
        scores[i] = best_score - scorer(y, y_pred)
        if scores[i] < 0:
            scores[i] = 0

    return scores
Example #29
File: GA.py Project: lisabang/iqsar
    def mkeindseed(self, desc_in_ind=5):
        if self.mkeindseed.count <= 100:
            prng = RandomState(self.seed + self.mkeindseed.count)
        if self.mkeindseed.count > 100:
            prng = RandomState(self.seed + (self.mkeindseed.count % 100))
        smple = prng.choice(self.basetable.columns, size=desc_in_ind, replace=False)
        return list(smple)
Example #30
def test_bboxes():
    PRNG = RandomState()
    PRNG2 = RandomState()
    if args.seed > 0:
        PRNG.seed(args.seed)
        PRNG2.seed(args.seed)

    transform = Compose(
        [
            [ColorJitter(prob=0.5)],  # or write [ColorJitter(), None]
            BoxesToCoords(),
            HorizontalFlip(),
            Expand((1, 4), prob=0.5),
            ObjectRandomCrop(),
            Resize(300),
            CoordsToBoxes(),
            #[SubtractMean(mean=VOC.MEAN)],
        ],
        PRNG,
        mode=None,
        fillval=VOC.MEAN,
        outside_points='clamp')

    viz = Viz()
    voc_dataset = VOCDetection(root=args.root,
                               image_set=[('2007', 'trainval')],
                               keep_difficult=True,
                               transform=transform)

    results = []
    count = 0
    i = PRNG2.choice(len(voc_dataset))
    for _ in range(100):
        img, boxes, labels = voc_dataset[i]
        if len(labels) == 0:
            continue

        img = viz.draw_bbox(img, boxes, labels, True)
        results.append(img)
        cv2.imshow('0', img[:, :, ::-1])
        c = cv2.waitKey(500)
        if c == 27 or c == ord('q'):  # ESC / 'q'
            break
        elif c == ord('c') or count >= 5:
            count = 0
            i = PRNG2.choice(len(voc_dataset))
        count += 1
Example #31
class ClassBalancedBatchSizeIterator(object):
    """
    Create batches of balanced size, that are also balanced per class, i.e.
    each class should be sampled roughly with the same frequency during
    training.

    Parameters
    ----------
    batch_size: int
        Resulting batches will not necessarily have the given batch size
        but rather the next largest batch size that allows splitting the set
        into balanced batches (maximum size difference 1).
    seed: int
        Random seed for initialization of the `numpy.random.RandomState`
        random generator that shuffles the batches.
    """

    def __init__(self, batch_size, seed=328774):
        self.batch_size = batch_size
        self.seed = seed
        self.rng = RandomState(self.seed)

    def get_batches(self, dataset, shuffle):
        n_trials = len(dataset.X)
        batches = get_balanced_batches(
            n_trials, batch_size=self.batch_size, rng=self.rng, shuffle=shuffle
        )
        if shuffle:
            n_classes = np.max(dataset.y) + 1
            class_probabilities = [
                np.mean(dataset.y == i_class) for i_class in range(n_classes)
            ]
            class_probabilities = np.array(class_probabilities)
            # choose trials in inverse probability of class
            trial_probabilities = [
                1.0 / class_probabilities[y] for y in dataset.y
            ]
            trial_probabilities = np.array(trial_probabilities) / np.sum(
                trial_probabilities
            )
            i_trial_to_balanced = self.rng.choice(
                n_trials, n_trials, p=trial_probabilities
            )

        for batch_inds in batches:
            if shuffle:
                batch_inds = [
                    i_trial_to_balanced[i_trial] for i_trial in batch_inds
                ]
            batch_X = dataset.X[batch_inds]
            batch_y = dataset.y[batch_inds]

            # add empty fourth dimension if necessary
            if batch_X.ndim == 3:
                batch_X = batch_X[:, :, :, None]
            yield (batch_X, batch_y)

    def reset_rng(self):
        self.rng = RandomState(self.seed)
Example #32
class RandomAgent(Agent):
    def __init__(self):
        # TODO: move seed into an argument.
        self.rng = RandomState(self.config['random_seed'])
        return

    def choose_actions(self, observations, infos, dones):
        return [self.rng.choice(info['admissible_commands']) for info in infos]
Example #33
def id_generator(size=6,
                 chars=string.ascii_uppercase + string.digits,
                 seed=None):
    #
    rnd = RandomState()
    if seed is not None:
        rnd = RandomState(seed)
    #
    return ''.join(rnd.choice(list(chars)) for _ in range(size))
Example #34
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = tagging_utils.standoff_to_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = tagging_utils.standoff_to_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(_is_not_meta_sentence, train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(train_sents_sample,
                                                                feature_extractor)
    X_test, _ = crf_labeler.sents_to_features_and_labels(test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    crf = crf_labeler.SentenceFilterCRF(
        ignore_sentence=meta_sentence_filter,
        ignored_label='O',
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    logger.info('Start training... {}'.format(crf))
    crf.fit(X_train, y_train)

    logger.info('CRF classes: {}'.format(crf.classes_))

    logger.info('Make predictions...')
    y_pred_test = crf.predict(X_test)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
Example #35
class Random(Subset):
    def __init__(self, **kwargs):
        self.size = kwargs['size']
        self.rs = RandomState(kwargs.get('seed'))

    def generate(self, backdoor: Backdoor):
        size = min(self.size, len(backdoor))
        variables = backdoor.snapshot()
        return Backdoor(self.rs.choice(variables, size, replace=False))
Example #36
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample into half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
Example #37
    def mkeindseed(self, desc_in_ind=5):
        if self.mkeindseed.count <= 100:
            prng = RandomState(self.seed + int(self.mkeindseed.count))
        if self.mkeindseed.count > 100:
            prng = RandomState(self.seed + int(self.mkeindseed.count % 100))
        smple = prng.choice(self.basetable.columns,
                            size=desc_in_ind,
                            replace=False)
        return list(smple)
Example #38
def test_phase_equal_after_bandpower_mean():
    rng = RandomState(3098284)
    inputs = rng.randn(50,20,1001,1)
    targets = rng.choice(4, size=50)
    target_arr = np.zeros((50,4))
    target_arr[:,0] = targets == 0
    target_arr[:,1] = targets == 1
    target_arr[:,2] = targets == 2
    target_arr[:,3] = targets == 3
    mod_inputs, mod_targets = BandpowerMeaner().process(inputs, target_arr)
    assert np.allclose(np.angle(np.fft.rfft(inputs, axis=2)),
            np.angle(np.fft.rfft(mod_inputs, axis=2)), rtol=1e-4, atol=1e-5)
    assert np.array_equal(target_arr, mod_targets)                          
Example #39
def test_random_choice():
    """nestle.random_choice() is designed to mimic np.random.choice(),
    for numpy < v1.7.0. In cases where we have both, test that they agree.
    """
    rstate = RandomState(0)
    p = rstate.rand(10)
    p /= p.sum()
    for seed in range(10):
        rstate.seed(seed)
        i = rstate.choice(10, p=p)
        rstate.seed(seed)
        j = nestle.random_choice(10, p=p, rstate=rstate)
        assert i == j
Example #40
def replace_characters(token, index_to_char, n=1, char_pool=string.ascii_lowercase, seed=17):
    if isinstance(seed, RandomState):
        rng = seed
    else:
        rng = RandomState(seed)

    new_token = token
    for i in six.moves.range(n):
        idx = max(1, rng.randint(len(new_token)))
        #ch = index_to_char[rng.randint(len(index_to_char))]
        ch = rng.choice(list(char_pool))
        new_token = unicode(new_token[0:idx-1] + ch + new_token[idx:])
    return new_token
Example #41
def rarefy_seqs(in_filename, out_filename, depth=1000, fmt="fastq", seed=0):
    """Rarefy a sequence file.
    """

    prng = RandomState(seed)

    records = SeqIO.index(in_filename, fmt)
    record_ids = [record_id for record_id in records.iterkeys()]
    record_ids = prng.choice(record_ids, replace=False, size=depth)

    out_handle = open(out_filename, 'w')
    for record_id in record_ids:
        SeqIO.write(records[record_id], out_handle, fmt)
    out_handle.close()
Example #42
def choose_random_nodes(G, ntr = 1, n_edges = 1):
    '''
    Returns a random set of absorbing nodes, if the other nodes in the
    graph form a connected component after removing the absorbing nodes.
    Otherwise it returns None
    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    ntr : the number of absorbing nodes
    Returns
    -------
    
    nodes_to_remove :
        The list of nodes in the graph to be made absorbing.
    is_viable:
        Boolean indicating whether the graph will stay connected after making
        the nodes absorbing, meaning whether the partition is viable
    '''
    prng = RandomState()
    order = array(G.nodes())
    nodes_to_remove = list(prng.choice(order, ntr, replace=False ))
    
    H = G.copy()
    H.remove_nodes_from(nodes_to_remove)
    if G.is_directed():
        n_components = nx.number_strongly_connected_components(H)
    else:
        n_components =  nx.number_connected_components(H)
        
    if n_components == 1:
        is_viable = True
        if G.is_directed():
            for node in nodes_to_remove:
                if (H.number_of_nodes() - len(set(G.predecessors(node)) 
                                           - set(nodes_to_remove))) < n_edges:
                    is_viable = False
                    break  
        else:
            for node in nodes_to_remove:
                if (H.number_of_nodes() - len(set(G.neighbors(node)) 
                                           - set(nodes_to_remove))) < n_edges:
                    is_viable = False
                    break   
        return nodes_to_remove, is_viable
        
    else:
        is_viable = False
        return nodes_to_remove, is_viable
Example #43
def generateDegradation(args, seed):
    from numpy.random import RandomState
    from numpy.linalg import norm

    rs = RandomState(seed)

    if args.D == 2:
        rotation = (rs.uniform(*args.rotate),)
    if args.D == 3:
        angle = rs.uniform(*args.rotate)
        axis = rs.uniform(size=3)
        axis = axis/norm(axis)
        rotation = angle, axis
    translation = rs.uniform(*args.translate, size=args.D)
    scale = rs.uniform(*args.scale)
    if args.drop[0] == args.drop[1]:
        ndrops = args.drop[0]
    else:
        ndrops = rs.randint(*sorted(args.drop))
    drops = rs.choice(range(args.N), size=ndrops, replace=False)
    duplications = rs.choice(range(args.duplicate[0], args.duplicate[1] + 1), size=args.N - ndrops)
    noise = rs.uniform(*args.noise) * rs.randn(sum(duplications), args.D)

    return rotation, translation, scale, drops, duplications, noise
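
generateDegradation expects an argparse-style namespace; below is a hedged sketch with made-up parameter ranges (the field names follow the attributes accessed above).

from types import SimpleNamespace

args = SimpleNamespace(D=2, N=50,
                       rotate=(-3.14, 3.14), translate=(-1.0, 1.0),
                       scale=(0.9, 1.1), drop=(0, 3),
                       duplicate=(1, 2), noise=(0.0, 0.05))
rotation, translation, scale, drops, duplications, noise = generateDegradation(args, seed=7)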
Example #44
    def rarefaction(M, seed=0):
        """
            taken from the below link
            http://stackoverflow.com/posts/18967204/revisions
        """
        prng = RandomState(seed) # reproducible results
        noccur = np.sum(M, axis=1) # number of occurrences for each sample
        nvar = M.shape[1] # number of variables
        depth = np.min(noccur) # sampling depth

        Mrarefied = np.empty_like(M)
        for i in range(M.shape[0]): # for each sample
            p = M[i] / float(noccur[i]) # relative frequency / probability
            choice = prng.choice(nvar, depth, p=p)
            Mrarefied[i] = np.bincount(choice, minlength=nvar)

        return Mrarefied
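
Treating rarefaction as a plain function (it is shown above without self), a small hedged demo: every row is resampled down to the smallest row total.

import numpy as np

M = np.array([[10, 0, 5],
              [2, 3, 1]])
Mr = rarefaction(M, seed=0)
print(Mr, Mr.sum(axis=1))  # both rows now sum to 6, the minimum depth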
Example #45
def rarefy_otu_table(in_filename, out_filename, perc=80, seed=0):
    """Rarefy OTU table to perc% of the least abundant sample. 
    """

    # Read OTU table
    otu_table = pd.read_csv(in_filename, sep='\t', index_col=0)
        
    # rarefaction
    prng = RandomState(seed)
    n_reads = otu_table.sum()
    depth = int(n_reads.min() * perc / 100)  # integer sampling depth
    for sample in otu_table:
        prob = otu_table[sample] / n_reads[sample]
        choice = prng.choice(otu_table.shape[0], depth, p=np.asarray(prob))
        otu_table[sample] = np.bincount(choice, minlength=otu_table.shape[0])

    # OTU pruning
    otu_table = otu_table.loc[otu_table.sum(axis=1) > 0]

    # Write OTU table
    otu_table.to_csv(out_filename, sep='\t')
Example #46
    def __init__(self, X, y, dataset, policy_name, scale=True, n_iter=10, passive=True):
        seed = RandomState(1234)
        self.X = np.asarray(X, dtype=np.float64)
        self.y = np.asarray(y)
        self.X = StandardScaler().fit_transform(self.X) if scale else self.X
        self.policy_name = policy_name
        self.dataset = dataset
        self.passive = passive

        # estimate the kernel using the 90th percentile heuristic
        random_idx = seed.choice(X.shape[0], 1000)
        distances = pairwise_distances(self.X[random_idx], metric='l1')
        self.gamma = 1 / np.percentile(distances, 90)
        transformer = RBFSampler(gamma=self.gamma, random_state=seed, n_components=100)
        self.X_transformed = transformer.fit_transform(self.X)

        n_samples = self.X.shape[0]
        train_size = min(10000, int(0.7 * n_samples))
        test_size = min(20000, n_samples - train_size)
        self.kfold = StratifiedShuffleSplit(self.y, n_iter=n_iter, test_size=test_size,
                                            train_size=train_size, random_state=seed)
Example #47
def generate_by_group(frame, by, source_map, source_cols=None, seed=None):
    """
    Adds columns to a trial list from multiple sources.
    
    Splits a trial list into chunks to add columns from various sources. Chunks
    are paired with sources based on unique values in `frame[by]`. See 
    :func:`generate` for more details.
    
    :param pandas.DataFrame frame: Trial list.
    :param str by: Grouping column in `frame`. Unique values are used as keys to
        get sources from `source_map`.
    :param dict source_map: Container of source lists. Keys are unique values of
        `frame[by]`. Values are pandas.DataFrame sources.
    :param source_cols: Columns of `source` to add to `frame`. Defaults to
        adding all columns of `source`. If `source_cols` is a dict, keys will be 
        renamed to values.
    :type source_cols: str, list, dict, or None
    :param seed: Seed random number generator. If `None` the result will not be
        randomized.
    :type seed: int or None
    :return: The `frame` with additional `source_cols` from `source`.
    :rtype: pandas.DataFrame
    """
    # create unique seeds for each part
    num_seeds = len(frame[by].unique()) + 1
    if seed is not None:
        prng = RandomState(seed)
        seeds = list(prng.choice(arange(1000), num_seeds))
    else:
        seeds = [None]*num_seeds
        
    def _generate_for_group(grp):
        group_key = grp[by].unique()[0]
        group_source = source_map[group_key]
        group_frame = generate(grp, group_source, source_cols, seeds.pop())
        group_frame.index = grp.index
        return group_frame
    
    return frame.groupby(by, group_keys=False).apply(_generate_for_group)
Example #48
def expand(valid, name, values=[1,0], ratio=0.5, sample=False, seed=None):
    """
    Copy rows as necessary to satisfy the valid:invalid ratio.
        
    Use when complete counterbalancing is not plausible. For example, when the 
    ratio of trials requiring response A to those requiring response B is not
    50:50.
    
    :param pandas.DataFrame valid: Trial list to be expanded.
    :param str name: Name of new column containing valid and invalid values
    :param list values: Values for valid and invalid trials, respectively.
    :param float ratio: Desired percentage of valid trials in the resulting 
        frame. Must be between 0 and 1. Defaults to 0.5.
    :param bool sample: Should the invalid trials be sampled from the valid 
        trials? If True, len(returned) < 2*len(valid). Defaults to False.
    :param seed: Seed random number generator.
    :type seed: int or None
    :return: New trial list with valid and invalid trials are denoted in a 
        new column.
    :rtype: pandas.DataFrame
    """
    prng = RandomState(seed)
    num_trials = len(valid)
    
    if not sample:
        invalid = valid[:]
        num_valid = (num_trials*ratio)/(1.0-ratio)
        copies = int(num_valid/num_trials)
        valid = pd.concat([valid]*copies, ignore_index=True)
    else:
        num_invalid = int((num_trials*(1.0-ratio))/ratio)
        sampled = prng.choice(valid.index, num_invalid, replace=False)
        invalid = valid.reindex(sampled).reset_index(drop=True)
    
    frame = pd.concat([valid, invalid], keys=values, names=[name,'DEFAULT'])
    frame = frame.reset_index().drop('DEFAULT', axis=1)
    return frame
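
A hedged example of the 50:50 default, assuming pandas and the function above are in scope: the two valid trials are copied once and flagged as invalid, giving four rows.

import pandas as pd

valid = pd.DataFrame({'target': ['left', 'right']})
trials = expand(valid, name='validity', values=[1, 0], ratio=0.5, seed=1)
print(trials)  # 4 rows: validity == 1 for the originals, 0 for the copies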
Example #49
    # for i in range(len(nsgk)):
    for i in range(20):  # loop over categories
        d = []          # list of the permutation distributions for each video
        tst = []        # list of test statistics for each video
        for j in range(len(x[i])):  # loop over videos
            res = simulate_ts_dist(x[i][j], keep_dist=True)
            d.append(res['dist'])
            tst.append(res['obs_ts'])
        perm_distr = np.asarray(d).transpose()
        category.append(
            simulate_npc_dist(perm_distr, size=time_stamps, obs_ts=tst))
    category_pvalues = []
    for i in range(len(category)):
        category_pvalues.append(category[i]['pvalue'])

freq = RNG.choice([0.2, 0.8], Ns)
res2 = np.zeros((R, Ns))

for i in range(len(freq)):
    res2[:, i] = RNG.binomial(1, freq[i], R)


def test_irr_concordance():
    rho_s2 = compute_ts(res2)
    assert_almost_equal(rho_s2, 0.70476190476190481)


def test_simulate_ts_dist_concordance():
    expected_res_conc = {'dist': None,
                         'geq': 0,
                         'obs_ts': 0.70476190476190481,
Example #50
fname = root_path + 'data0_berg.json'
data0_berg = utils_local.load_data0(fname=fname)
item_types = ['bags', 'ties', 'earrings', 'shoes']


data = {}
data['items'] = []


for item_type in item_types:
    N = len(data0_berg[item_type])
    print "item_type", item_type
    print "number of items in ", item_type, ": ", N

    N20 = int(0.2 * N)  # 20% of the data
    test_val_split = prng.choice(N, N20, replace=False)  # randomly choose 2000 imgid for test and validations

    print "len test val split", len(test_val_split)
    # print test_val_split

    N10 = int(0.1 * N)  # 10% of the data
    test_split = prng.choice(test_val_split, N10, replace=False)
    print "len test split", len(test_split)
    #print test_split

    for item in data0_berg[item_type]:

        new_item = mk_new_item(item)
        # add item to data
        data['items'].append(new_item)
Example #51
class RandomizationTests(object):
    """Randomization tests for two-sample comparison with user-defined test
    statistic
    """

    def __init__(
            self,
            measure_central_tendency=None,
            name='Arithmetic mean',
            method='monte',
            alternative='two_sided',
            seed=None):
        """Constructor"""
        self.mct = measure_central_tendency
        self.name = name
        self.method = method
        self.alternative = alternative
        self._1stexe = None  # first execution
        self._tobs = None  # observed test statistic value
        self._mctA = None
        self._mctB = None
        self._nA = None
        self._data = None
        self._indices = None
        self._count = 0
        self._nperm = 0  # Number of permutations
        self.pvalue = None
        self._prng = RandomState(seed)

    @property
    def mct(self):
        return self._mct

    @mct.setter
    def mct(self, value):
        if isinstance(value, types.FunctionType):
            self._mct = value
        else:
            self._mct = self._arithmetic_mean

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        if value:
            if isinstance(value, str):
                self._name = value

    @property
    def method(self):
        return self._method

    @method.setter
    def method(self, value):
        assert value in ['systematic', 'monte']
        self._method = value

    @property
    def alternative(self):
        return self._alternative

    @alternative.setter
    def alternative(self, value):
        assert value in ['two_sided', 'greater', 'less']
        self._alternative = value

    def _compute_test_statistic(self, x, y):
        return self._mct(x) - self._mct(y)

    def _arithmetic_mean(self, x):
        return sum(x) / len(x)

    def _process_data_permutation(self, groupA_indices):
        self._nperm += 1
        groupB_indices = set(self._indices).difference(groupA_indices)
        t = self._compute_test_statistic(
            [self._data[i] for i in groupA_indices],
            [self._data[j] for j in groupB_indices],
            )
        if self._alternative == 'two_sided':
            self._count += 1 if abs(t) >= abs(self._tobs) else 0
        elif self._alternative == 'greater':
            self._count += 1 if t >= self._tobs else 0
        else:
            self._count += 1 if t <= self._tobs else 0

    def execute(self, x=None, y=None, number_of_permutations=10000):
        if x and y:
            if isinstance(x, (list, tuple)) and isinstance(y, (list, tuple)):
                if all([isinstance(i, (int, float)) for i in x]):
                    self._nA = len(x)
                    self._mctA = self._mct(x)
                else:
                    raise TypeError('Elements in x should be numbers')
                if all([isinstance(j, (int, float)) for j in y]):
                    self._mctB = self._mct(y)
                else:
                    raise TypeError('Elements in y should be numbers')
            self._1stexe = True
            self._tobs = self._compute_test_statistic(x, y)
            self._data = x + y
            self._indices = range(self._nA + len(y))
        assert isinstance(number_of_permutations, int)
        if self.method == 'monte':
            # Monte Carlo randomization test with a valid p value, i.e.,
            # include t_obs in the reference set
            if self._1stexe:
                self._1stexe = False
                self._count += 1
                self._nperm += 1
                number_of_permutations -= 1
            for _ in range(number_of_permutations):
                groupA_indices = self._prng.choice(
                    self._indices,
                    self._nA,
                    replace=False,
                    )
                self._process_data_permutation(groupA_indices)
        else:
            for groupA_indices in combinations(self._indices, self._nA):
                self._process_data_permutation(groupA_indices)
        self.pvalue = self._count / self._nperm

    def summary(self):
        if self.method == 'systematic':
            print('Systematic randomization test for two groups')
        else:
            print('Monte Carlo randomization test for two groups')
        print('Alternative hypothesis: {0}'.format(self._alternative))
        print('{0} of group A: {1:.2f}'.format(self._name, self._mctA))
        print('{0} of group B: {1:.2f}'.format(self._name, self._mctB))
        print('Observed test statistic value: {0:.2f}'.format(self._tobs))
        print('Count: {0:d}'.format(self._count))
        print('Number of permutations: {0:d}'.format(self._nperm))
        print('p value: {0:.4f}'.format(self.pvalue))
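A minimal usage sketch for the RandomizationTests class above. The two samples and the seed are illustrative only, and the class is assumed to be importable together with its dependencies (numpy's RandomState, itertools.combinations, types):

# Hypothetical data: measurements from two independent groups.
group_a = [12.4, 11.8, 13.1, 12.9, 14.2, 13.5]
group_b = [10.9, 11.2, 10.4, 11.7, 10.8, 11.0]

rt = RandomizationTests(method='monte', alternative='two_sided', seed=42)
rt.execute(x=group_a, y=group_b, number_of_permutations=10000)
rt.summary()        # prints the observed statistic, the count and the p value
print(rt.pvalue)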
def find_beacons_sample_inverse(G, num_of_beacons=3, seed=None):
    """Sample beacons with probability inversely proportional to node degree."""
    from numpy.random import RandomState
    prng = RandomState(seed)
    # dict(...) keeps this working with both old and new networkx degree APIs
    degrees = np.array(list(dict(nx.degree(G)).values()))
    inverse = 1. / degrees
    return prng.choice(np.arange(len(degrees)), num_of_beacons,
                       p=inverse / inverse.sum(), replace=False)
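A short usage sketch for the beacon sampler above (assumes networkx and numpy are installed; the karate club graph is only a convenient test graph and the seed is arbitrary):

import networkx as nx

G = nx.karate_club_graph()
beacons = find_beacons_sample_inverse(G, num_of_beacons=3, seed=7)
print(beacons)  # indices of the sampled beacon nodes; low-degree nodes are favoured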
Example #53
0
    def _run_fold(self, train_index, test_index):
        # reset the seed
        seed = RandomState(1234)

        # split data into train and test sets
        pool = self.X_transformed[train_index]
        oracle = self.y[train_index]
        labels = np.ma.MaskedArray(oracle, mask=True, copy=True)
        X_test = self.X_transformed[test_index]
        y_test = self.y[test_index]
        n_classes = len(np.unique(y_test))
        similarity = rbf_kernel(self.X[train_index], gamma=self.gamma)
        mpba, accuracy, f1 = [], [], []
        training_size = min(1000, len(pool))
        initial_n = 50
        horizon = training_size - initial_n

        # initialise classifier
        classifier = LogisticRegression(multi_class='ovr', penalty='l2', C=1000,
                                        random_state=seed, class_weight='balanced')
        committee = BaggingClassifier(classifier, n_estimators=7, n_jobs=1, max_samples=100,
                                      random_state=seed)

        # select the specified policy
        policy = self._get_policy(self.policy_name, pool, labels, classifier,
                                  committee, seed, similarity, horizon)

        # select 50 initial random examples for labelling
        sample_idx = seed.choice(np.arange(len(pool)), initial_n, replace=False)
        policy.add(sample_idx, oracle[sample_idx])
        y_pred = classifier.predict(X_test)
        mpba.append(mpba_score(y_test, y_pred))
        accuracy.append(accuracy_score(y_test, y_pred))
        f1.append(micro_f1_score(y_test, y_pred, n_classes))

        # start running the policy
        while np.sum(~labels.mask) < training_size:
            # use the policy to select the next instance for labelling
            best_candidates = policy.select()

            # query the oracle and add label
            policy.add(best_candidates, oracle[best_candidates])

            # observe the reward
            y_pred = classifier.predict(X_test)
            mpba.append(mpba_score(y_test, y_pred))
            reward = mpba[-1] - mpba[-2]

            # also compute accuracy and f1 score
            accuracy.append(accuracy_score(y_test, y_pred))
            f1.append(micro_f1_score(y_test, y_pred, n_classes))

            # normalise the reward to [0, 1]
            reward = (reward + 1) / 2
            policy.receive_reward(reward)

        history = policy.history()
        history['mpba'] = np.array(mpba)
        history['accuracy'] = np.array(accuracy)
        history['f1'] = np.array(f1)

        return history
Example #54
0
class BaseActive:
    """ Base class for active learning. """

    def __init__(self, classifier, best_heuristic=None, accuracy_fn=compute_balanced_accuracy,
                 initial_n=20, training_size=100, sample_size=20, n_candidates=1,
                 verbose=False, random_state=None, pool_random_state=None, **h_kwargs):

        self.classifier = classifier
        self.best_heuristic = best_heuristic
        self.accuracy_fn = accuracy_fn
        self.initial_n = initial_n
        self.training_size = training_size
        self.current_training_size = 0
        self.n_candidates = n_candidates
        self.sample_size = sample_size
        self.verbose = verbose
        self.learning_curve_ = []
        self.h_kwargs = h_kwargs
        self.candidate_selections = []
        self.rng = RandomState(random_state)
        self.pool_rng = RandomState(pool_random_state)


    def _random_sample(self, pool_size, train_mask, sample_size):
        """ Select a random sample from the pool.

            Parameters
            ----------
            pool_size : int
                The total number of data points (both queried and unlabelled).

            train_mask : boolean array
                The boolean array that tells us which points are currently in the training set.

            sample_size : int
                The size of the random sample.

            Returns
            -------
            candidate_mask : boolean array
                The boolean array that tells us which data points the heuristic should examine.
        """

        # use bitwise NOT: unary minus on boolean arrays is an error in modern numpy
        candidate_mask = ~train_mask

        if 0 < sample_size < np.sum(candidate_mask):
            unlabelled_index = np.where(candidate_mask)[0]
            candidate_index = self.rng.choice(unlabelled_index, sample_size, replace=False)
            candidate_mask = np.zeros(pool_size, dtype=bool)
            candidate_mask[candidate_index] = True

        return candidate_mask


    def _select_heuristic(self):
        """ Choose a heuristic to be used (useful in bandits active learning). """

        return None


    def _store_results(self, accuracy):
        """ Store results at the end of an iteration. """

        self.learning_curve_.append(accuracy)


    def _print_progress(self):
        """ Print out current progress. """
        if self.current_training_size % 1000 == 0:
            print(self.current_training_size, end='')
        elif self.current_training_size % 100 == 0:
            print('.', end='')


    def select_candidates(self, X, y, candidate_mask, train_mask):
        """ Return the indices of the best candidates.

            Parameters
            ----------
            X : array
                The feature matrix of all the data points.

            y : array
                The target vector of all the data points.

            candidate_mask : boolean array
                The boolean array that tells us which data points the heuristic should examine.

            train_mask : boolean array
                The boolean array that tells us which points are currently in the training set.

            Returns
            -------
            best_candidates : array
                The list of indices of the best candidates.
        """

        return self.best_heuristic(X=X, y=y, candidate_mask=candidate_mask,
                                   train_mask=train_mask, classifier=self.classifier,
                                   n_candidates=self.n_candidates, random_state=self.pool_rng.randint(1000),
                                   **self.h_kwargs)


    def fit(self, X_train, y_train, X_test=None, y_test=None):
        """ Conduct active learning.

            Parameters
            ----------
            X_train : array
                The feature matrix of all the data points.

            y_train : array
                The target vector of all the data points.

            X_test : array
                If supplied, this will be used to compute an accuracy score for the learning curve.

            y_test : array
                If supplied, this will be used to compute an accuracy score for the learning curve.
        """

        pool_size = X_train.shape[0]
        n_features = X_train.shape[1]

        # boolean index of the samples which have been queried and are in the training set
        train_mask = np.zeros(pool_size, dtype=bool)

        # select an initial random sample from the pool and train the classifier
        sample = self.rng.choice(np.arange(pool_size), self.initial_n, replace=False)
        self.candidate_selections += list(sample)
        train_mask[sample] = True
        self.classifier.fit(X_train[train_mask], y_train[train_mask])
        self.current_training_size += len(sample)

        # obtain the first data point of the learning curve
        if X_test is not None and y_test is not None:
            accuracy = self.accuracy_fn(self.classifier, X_test, y_test)
            self.learning_curve_.append(accuracy)

        # keep training the classifier until we have a desired sample size
        while np.sum(train_mask) < self.training_size:
            
            # select a random sample from the unlabelled pool
            candidate_mask = self._random_sample(pool_size, train_mask, self.sample_size)

            # select the heuristic to be used
            self._select_heuristic()

            # pick the index of the best candidates
            best_candidates = self.select_candidates(X_train, y_train, candidate_mask, train_mask)
            self.candidate_selections += list(best_candidates)

            # retrain the classifier
            train_mask[best_candidates] = True
            self.classifier.fit(X_train[train_mask], y_train[train_mask])
            self.current_training_size += len(best_candidates)

            # obtain the next data point of the learning curve
            if X_test is not None and y_test is not None:
                accuracy = self.accuracy_fn(self.classifier, X_test, y_test)
                self._store_results(accuracy)

            # print progress after every 100 queries
            if self.verbose:
                self._print_progress()

            assert self.current_training_size == np.sum(train_mask), \
                   'Mismatch detected in the training size. Check your heuristic.'


    def predict(self, X):
        """ Predict the target values of X given the model.

            Parameters
            ----------
            X : array
                The feature matrix

            Returns
            -------
            y : array
                Predicted values.
        """
        return self.classifier.predict(X)
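A small end-to-end sketch of how BaseActive might be driven. Everything below is illustrative: random_heuristic is a hypothetical stand-in that queries unlabelled points uniformly at random, and the accuracy function is passed explicitly so the sketch does not depend on compute_balanced_accuracy from the original module:

import numpy as np
from numpy.random import RandomState
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression


def random_heuristic(X, y, candidate_mask, train_mask, classifier,
                     n_candidates, random_state, **kwargs):
    """Hypothetical heuristic: pick candidates uniformly at random."""
    rng = RandomState(random_state)
    candidates = np.where(candidate_mask)[0]
    return rng.choice(candidates, n_candidates, replace=False)


X, y = make_classification(n_samples=300, n_features=10, random_state=0)
learner = BaseActive(classifier=LogisticRegression(max_iter=1000),
                     best_heuristic=random_heuristic,
                     accuracy_fn=lambda clf, Xt, yt: clf.score(Xt, yt),
                     initial_n=20, training_size=60, sample_size=20,
                     random_state=1, pool_random_state=2)
learner.fit(X[:200], y[:200], X_test=X[200:], y_test=y[200:])
print(learner.learning_curve_)  # one accuracy value per labelling round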
Example #55
0
class ImageCollection(object):

    online = True

    def __init__(self, mode="random", random_state=2, nb=100,
                 size=(224, 224), crop=False,
                 folder=None,
                 filename_to_label=None,
                 process_dirs=None,
                 recur=False,
                 verbose=1,
                 **kwargs):
        if not hasattr(self, "folder"):
            assert folder is not None
            self.folder = folder
        if not hasattr(self, "filename_to_label"):
            if filename_to_label is None:
                def filename_to_label(directory, filename):
                    return hash(directory)
                self.filename_to_label = filename_to_label
        if not hasattr(self, "process_dirs"):
            if process_dirs is None:
                def process_dirs(dirs):
                    return filter(lambda d: os.path.isdir(d), dirs)
            self.process_dirs = process_dirs

        path = os.path.join(self.folder)

        if recur:
            directories = (root for root, _, _ in os.walk(path))
            all_dirs = list(directories)
        else:
            directories = os.listdir(path)
            all_dirs = map(lambda d: path + "/" + d, directories)

        # materialise the map/filter result so it can be indexed and sampled from
        all_dirs = list(self.process_dirs(all_dirs))
        self.all_dirs = all_dirs

        self.mode = mode
        self.rng = RandomState(random_state)
        self.nb = nb
        self.size = size
        self.crop = crop
        self.verbose = verbose

        if size is not None:
            self.img_dim = (size[1], size[0], 3)
        else:
            self.img_dim = None

    def load(self):
        X = []
        y = []
        if self.mode == 'random':
            def get_next():
                while True:
                    d = self.rng.choice(self.all_dirs)
                    filenames = os.listdir(d)
                    filename = self.rng.choice(filenames)
                    yield d, filename

        elif self.mode == 'all':
            def get_next():
                for d in self.all_dirs:
                    for filename in os.listdir(d):
                        yield d, filename
        else:
            raise Exception("invalid mode : {}".format(self.mode))
        get_next_iter = get_next()
        while len(X) < self.nb:
            try:
                d, filename =  next(get_next_iter)
            except StopIteration:
                break
            try:
                x = imread(d + "/" + filename)
                h, w = x.shape[0:2]
                if self.crop:
                    if h >= self.size[0]:
                        a = (h - self.size[0]) // 2
                        b = h - self.size[0] - a
                        x = x[a:h - b]
                    if w >= self.size[1]:
                        a = (w - self.size[1]) // 2
                        b = w - self.size[1] - a
                        x = x[:, a:w - b]
                    x = resize(x, self.size)
                else:
                    if self.size is not None:
                        x = resize(x, self.size)

            except Exception as ex:
                if self.verbose > 0:
                    print("Exception when processing {} : {}".format(filename, repr(ex)))
                continue
            if len(x.shape) == 2:
                x = x[:, :, None] * np.ones((1, 1, 3))
            if len(x.shape) == 3 and x.shape[-1] == 4:
                x = x[:, :, 0:3]
            if len(x.shape) == 3 and x.shape[-1] > 4:
                # there is an image with shape[2]=90, wtf?
                continue
            X.append(x)
            l = self.filename_to_label(d, filename)
            y.append(l)
        X = np.array(X).astype(np.float32)

        if self.img_dim is None:
            self.img_dim = X.shape[1:]
        self.X = X
        self.y = y
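A hedged usage sketch for ImageCollection. The directory path is hypothetical, and imread/resize are assumed to be the scikit-image functions imported in the original module:

# Expected layout: ./images/<class_dir>/<image files>
coll = ImageCollection(mode="random", nb=32, size=(224, 224),
                       folder="./images", random_state=0)
coll.load()
print(coll.X.shape)  # e.g. (32, 224, 224, 3) after resizing
print(len(coll.y))   # one label (hash of the source directory) per image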
Example #56
0
class BatchWiseCntTrainer(object):
    def __init__(self, exp, n_updates_per_break, batch_size, learning_rate,
                n_min_trials, trial_start_offset, break_start_offset,
                break_stop_offset,
                train_param_values,
                deterministic_training=False, add_breaks=True):
        self.cnt_model = exp.final_layer
        self.exp = exp
        self.n_updates_per_break = n_updates_per_break
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.n_min_trials = n_min_trials
        self.trial_start_offset = trial_start_offset
        self.break_start_offset = break_start_offset
        self.break_stop_offset = break_stop_offset
        self.train_param_values = train_param_values
        self.deterministic_training = deterministic_training
        self.add_breaks = add_breaks
        
    def set_predicting_model(self, model):
        """ Needed to keep trained and used params in sync, i.e.
        Update the params of the epo model used for prediction
        with those params of the trained cnt model."""
        self.predicting_model = model
        
    def set_data_processor(self, data_processor):
        self.data_processor = data_processor

    def set_marker_buffer(self, marker_buffer):
        self.marker_buffer = marker_buffer
        
    def initialize(self):
        """ Initialize data containers and theano functions for training."""
        self.rng = RandomState(30948348)
        self.data_batches = []
        self.y_batches = []
        self.input_time_length = get_input_time_length(self.cnt_model)
        self.n_sample_preds = get_n_sample_preds(self.cnt_model)
        self.n_classes = self.cnt_model.output_shape[1]
        # create train function
        log.info("Compile train function...")
        self._create_train_function()
        log.info("Done compiling train function.")
        
    def _create_train_function(self):
        # Maybe replace self.exp.final_layer by self.cnt_model?
        # not clear to me why I am using self.exp.final_layer here 
        targets = T.ivector()
        input_var = get_input_var(self.exp.final_layer)
        updates_expression = FuncAndArgs(adam, learning_rate=self.learning_rate)
        prediction = lasagne.layers.get_output(self.exp.final_layer,
            deterministic=self.deterministic_training, input_var=input_var,
            inputs=input_var)
        # Loss function might need layers or not...
        try:
            loss = self.exp.loss_expression(prediction, targets).mean()
        except TypeError:
            loss = self.exp.loss_expression(prediction, targets,
                self.exp.final_layer).mean()
        # create parameter update expressions
        params = lasagne.layers.get_all_params(self.exp.final_layer,
            trainable=True)
        updates = updates_expression(loss, params)
        if self.exp.updates_modifier is not None:
            # put norm constraints on all layer, for now fixed to max kernel norm
            # 2 and max col norm 0.5
            updates = self.exp.updates_modifier.modify(updates,
                self.exp.final_layer)
            
        # store only the parameters for training,
        # assumes parameters for layers already set
        self.train_params = []
        all_update_params = updates.keys()
        for update_param in all_update_params:
            if update_param not in params:
                self.train_params.append(update_param)
        
        self.train_func = theano.function([input_var, targets], updates=updates)
        
        # Set optimizer/train parameter values if not done
        if self.train_param_values is not None:
            log.info("Setting train parameter values")
            for param, val in zip(self.train_params, self.train_param_values):
                param.set_value(val)
            log.info("...Done setting parameter train values")
        else:
            log.info("Not setting train parameter values, optimization values "
            "start from scratch (model params may be loaded anyways.)")
            
    def add_data_from_today(self, data_processor):
        # Check if old data exists, if yes add it
        now = datetime.datetime.now()
        day_string = now.strftime('%Y-%m-%d')
        data_folder = 'data/online/{:s}'.format(day_string)
        # sort should sort timewise for our timeformat...
        data_files = sorted(glob(os.path.join(data_folder, '*.npy')))
        if len(data_files) > 0:
            log.info("Loading {:d} data files for adaptation:\n{:s}".format(
                len(data_files), str(data_files)))
            for filename in data_files:
                log.info("Add data from {:s}...".format(filename))
                samples_markers = np.load(filename)
                samples = samples_markers[:,:-1]
                markers = np.int32(samples_markers[:,-1])
                self.add_training_blocks_from_old_data(samples, markers,
                    data_processor)
            log.info("Done loading, now have {:d} trials (including breaks)".format(
                len(self.data_batches)))
        else:
            log.info("No data files found to load for adaptation in {:s}".format(
                data_folder))

    def add_training_blocks_from_old_data(self, old_samples,
            old_markers, data_processor):
        # first standardize data
        old_samples = exponential_running_standardize(old_samples, 
            factor_new=data_processor.factor_new, init_block_size=1000, 
            eps=data_processor.eps)
        trial_starts, trial_stops = self.get_trial_start_stop_indices(
                old_markers)
        log.info("Adding {:d} trials".format(len(trial_starts)))
        for trial_start, trial_stop in zip(trial_starts, trial_stops):
            self.add_blocks(trial_start + self.trial_start_offset, 
                trial_stop, old_samples, old_markers)
        # now lets add breaks
        log.info("Adding {:d} breaks".format(len(trial_starts) - 1))
        for break_start, break_stop in zip(trial_stops[:-1], trial_starts[1:]):
            self.add_break(break_start, break_stop, old_samples, old_markers)

    def process_markers(self, markers):
        # Check if a trial has ended with last samples
        # need marker samples with some overlap
        # so we do not miss trial boundaries inbetween two sample blocks
        marker_samples_with_overlap = np.copy(
            self.marker_buffer[-len(markers)-2:])
        trial_has_ended = np.sum(np.diff(marker_samples_with_overlap) < 0) > 0
        if trial_has_ended:
            trial_starts, trial_stops = self.get_trial_start_stop_indices(
                self.marker_buffer)
            trial_start = trial_starts[-1]
            trial_stop = trial_stops[-1]
            log.info("Trial has ended for class {:d}".format(
                self.marker_buffer[trial_start]))
            assert trial_start < trial_stop, ("trial start {:d} should be "
                "before trial stop {:d}, markers: {:s}").format(trial_start, 
                    trial_stop, str(marker_samples_with_overlap))
            self.add_blocks(trial_start + self.trial_start_offset, trial_stop,
                self.data_processor.sample_buffer,
                self.marker_buffer)
            log.info("Now {:d} trials (including breaks)".format(
                len(self.data_batches)))
            
            with log_timing(log, None, final_msg='Time for training:'):
                self.train()
        trial_has_started = np.sum(np.diff(marker_samples_with_overlap) > 0) > 0
        if trial_has_started:
            trial_end_in_marker_buffer = np.sum(np.diff(self.marker_buffer) < 0) > 0
            if trial_end_in_marker_buffer:
                # +1 necessary since diff removes one index
                trial_start = np.flatnonzero(np.diff(self.marker_buffer) > 0)[-1] + 1
                trial_stop = np.flatnonzero(np.diff(self.marker_buffer) < 0)[-1] + 1
                assert trial_start > trial_stop, ("If trial has just started "
                    "expect this to be after stop of last trial")
                self.add_break(break_start=trial_stop, break_stop=trial_start,
                    all_samples=self.data_processor.sample_buffer,
                    all_markers=self.marker_buffer)
            #log.info("Break added, now at {:d} batches".format(len(self.data_batches)))
                
    def add_break(self, break_start, break_stop, all_samples, all_markers):
        if self.add_breaks:
            all_markers = np.copy(all_markers)
            assert np.all(all_markers[break_start:break_stop] == 0)
            assert all_markers[break_start - 1] != 0
            assert all_markers[break_stop] != 0
            # keep n_classes for 1-based matlab indexing logic in markers
            all_markers[break_start:break_stop] = self.n_classes
            self.add_blocks(break_start + self.break_start_offset, 
                break_stop + self.break_stop_offset, all_samples,
                all_markers)
        else:
            pass #Ignore break that was supposed to be added

    def get_trial_start_stop_indices(self, markers):
        # +1 as diff "removes" one index, i.e. diff will be above zero
        # at the index 1 before the increase => the trial start
        trial_starts = np.flatnonzero(np.diff(markers) > 0) + 1
        # diff removes an index, so this index is the last sample of the trial,
        # but stop indices in python are exclusive, so +1
        trial_stops = np.flatnonzero(np.diff(markers) < 0) + 1

        if trial_starts[0] >= trial_stops[0]:
            # cut out first trial which only has end marker
            trial_stops = trial_stops[1:]
        if trial_starts[-1] >= trial_stops[-1]:
            # cut out last trial which only has start marker
            trial_starts = trial_starts[:-1]
        
        assert(len(trial_starts) == len(trial_stops))
        assert(np.all(trial_starts <= trial_stops))
        return trial_starts, trial_stops
    
    def add_blocks(self, trial_start, trial_stop, all_samples, all_markers):
        """Trial start offset as parameter to give different offsets
        for break and normal trial."""
        # n_sample_preds is how many predictions done for
        # one forward pass of the network -> how many crops predicted
        # together in one forward pass for given input time length of 
        # the ConvNet
        # -> crop size is how many samples are needed for one prediction
        crop_size = self.input_time_length - self.n_sample_preds + 1
        if trial_start + self.n_sample_preds > trial_stop:
            log.info("Too little data in this trial to train in it, only "
                "{:d} predictable samples, need atleast {:d}".format(
                     trial_stop - trial_start, self.n_sample_preds))
            return # Too little data in this trial to train on it...
        needed_sample_start = trial_start - crop_size + 1
        # not sure if copy necessary, but why not :)
        needed_samples = np.copy(all_samples[needed_sample_start:trial_stop])
        trial_markers = all_markers[needed_sample_start:trial_stop]
        # trial start can't be at zero atm or else we would have to take more data
        assert (len(np.unique(trial_markers[(crop_size - 1):])) == 1), (
            ("Trial should have exactly one class, markers: {:s} "
                "trial start: {:d}, trial_stop: {:d}").format(
                np.unique(trial_markers[(crop_size - 1):]), # crop_size -1 is index of first prediction
                needed_sample_start, trial_stop))
        self.add_trial_topo_trial_y(needed_samples, trial_markers)
        
    def add_trial_topo_trial_y(self, needed_samples, trial_markers):
        """ needed_samples are samples needed for predicting entire trial,
        i.e. they typically include a part before the first sample of the trial."""
        crop_size = self.input_time_length - self.n_sample_preds + 1
        assert (len(np.unique(trial_markers[(crop_size - 1):])) == 1), (
            ("Trial should have exactly one class, markers: {:s} ").format(
                np.unique(trial_markers[(crop_size - 1):])))
        trial_topo = needed_samples[:,:,np.newaxis,np.newaxis]
        trial_y = np.copy(trial_markers) - 1 # -1 as zero is non-trial marker
        trial_len = len(trial_topo)
        start_end_blocks = get_start_end_blocks_for_trial(crop_size-1,
            trial_len-1, self.input_time_length, self.n_sample_preds)
        assert start_end_blocks[0][0] == 0, "First block should start at first sample"
        batch = create_batch(trial_topo, trial_y, start_end_blocks,
            self.n_sample_preds)
        self.data_batches.append(batch[0])
        self.y_batches.append(batch[1])
        
    def train(self):
        n_trials = len(self.data_batches)
        if n_trials >= self.n_min_trials:
            log.info("Training model...")
            # Remember values as backup in case of NaNs
            model_param_vals_before = lasagne.layers.get_all_param_values(self.exp.final_layer)
            train_param_vals_before = [p.get_value() for p in self.train_params]
            all_blocks = np.concatenate(self.data_batches, axis=0)
            all_y_blocks = np.concatenate(self.y_batches, axis=0)
            # reshape to per block
            # assuming right now targets are simply labels
            # not one-hot encoded
            all_y_blocks = np.reshape(all_y_blocks, (-1, self.n_sample_preds))
            
            # make classes balanced
            # hopefully this is correct?! any sample should be fine, -10 is an arbitrary choice
            labels_per_block = all_y_blocks[:,-10]
            unique_labels = sorted(np.unique(labels_per_block))
            if not np.array_equal(range(len(unique_labels)), 
                unique_labels):
                missing_classes = np.setdiff1d(range(len(unique_labels)),
                    unique_labels)
                log.info(("Do not have labels for all classes yet, "
                    "missing: {:s}, Skipping training...".format(
                        str(missing_classes))))
                return
            class_probs = np.zeros(len(unique_labels))
            for i_class in unique_labels:
                freq = np.mean(labels_per_block == i_class)
                prob = 1.0/ (len(unique_labels) * freq)
                class_probs[i_class] = prob
            block_probs = np.zeros(len(labels_per_block))
            for i_class in unique_labels:
                block_probs[labels_per_block == i_class] = class_probs[i_class]
            block_probs = block_probs / np.sum(block_probs)
            
            assert len(all_blocks) == len(all_y_blocks)
            for _ in range(self.n_updates_per_break):
                i_blocks = self.rng.choice(len(all_y_blocks),
                    size=self.batch_size, p=block_probs)
                this_y = np.concatenate(all_y_blocks[i_blocks], axis=0)
                this_topo = all_blocks[i_blocks]
                self.train_func(this_topo, this_y)

            # Check for Nans and if necessary reset to old values
            if np.any([np.any(np.isnan(p.get_value())) for p in self.train_params]):
                log.warn("Reset train parameters due to NaNs")
                for p, old_val in zip(self.train_params, train_param_vals_before):
                    p.set_value(old_val)
            all_layers_trained = lasagne.layers.get_all_layers(self.exp.final_layer)
            if np.any([np.any(np.isnan(p_val))
                    for p_val in lasagne.layers.get_all_param_values(all_layers_trained)]):
                log.warn("Reset model params due to NaNs")
                lasagne.layers.set_all_param_values(self.exp.final_layer, model_param_vals_before)
            assert not np.any([np.any(np.isnan(p.get_value())) for p in self.train_params])
            assert not np.any([np.any(np.isnan(p_val))
                    for p_val in lasagne.layers.get_all_param_values(all_layers_trained)])
            
            # Copy over new values to model used for prediction
            all_layers_used = lasagne.layers.get_all_layers(self.predicting_model)
            lasagne.layers.set_all_param_values(all_layers_used,
                lasagne.layers.get_all_param_values(all_layers_trained))
        else:
            log.info("Not training model yet, only have {:d} of {:d} trials ".format(
                n_trials, self.n_min_trials))
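The trial-boundary bookkeeping above hinges on np.diff over the marker stream; a tiny standalone sketch with synthetic markers (not data from the original system) shows the +1 index shift:

import numpy as np

markers = np.array([0, 0, 2, 2, 2, 0, 0, 1, 1, 0])        # two trials: class 2, then class 1
trial_starts = np.flatnonzero(np.diff(markers) > 0) + 1   # -> array([2, 7])
trial_stops = np.flatnonzero(np.diff(markers) < 0) + 1    # -> array([5, 9])
print(list(zip(trial_starts, trial_stops)))
# markers[2:5] == [2, 2, 2] and markers[7:9] == [1, 1], so each trial is sliced exactly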
Example #57
0
                   for _ in range(params["NUM_FEATURES"])]

learner = Learner(features)

sum_of_errors = 0.0

mse_history = np.zeros(10000)
ema_history = np.zeros(10000)
exponential_moving_avg = 0.0

for iteration in range(params["NUM_ITERATIONS"]):

    if params["CONTINUOUS_INPUTS"]:
        random_input = rng.rand(params["NUM_INPUT_VARS"])
    else:
        random_input = rng.choice([0, 1], params["NUM_INPUT_VARS"])

    if params["NON_STATIONARY"]:
        # switch between the two target functions every 100000 iterations
        target_val = (target_functions[(iteration // 100000) % 2].get_output(random_input)
                      + rng.normal(0, 1))
    else:
        target_val = (target_functions[0].get_output(random_input)
                      + rng.normal(0, 1))
    
    mse = learner.train(random_input, target_val, params["BASE_LEARNING_RATE"], 
                        iteration)
    if iteration == 0:
        exponential_moving_avg = mse
    else:
        exponential_moving_avg = exponential_moving_avg * 0.999 + mse * 0.001
Example #58
0
            dress[f] = ''
    return dress

data0_dress = utils_local.load_data0(fname='../../DATASETS/dress_attributes/data/json/data0.json')
#i = 239 #1224  #357



data = {}
data['items'] = []  # a list of dictionaries


N = len(data0_dress['dresses'])  # number of dresses


# `prng` is assumed to be a numpy RandomState created earlier in the original script.
test_val_split = prng.choice(N, 5000, replace=False)  # randomly choose 5000 image ids for test and validation
test_split = prng.choice(test_val_split, 1000, replace=False)  # randomly choose 1000 of those for test; the rest are for validation






# # sample from a bernoulli distribution N times
# # toss a coin N times with prob. p of getting heads (1)
# N = len(data0['dresses'])  # number of dresses
# p = 0.8  # with probability p a dress is assigned to the train split
# split = np.random.binomial(1, p, N)  # bernoulli is a binomial with only 1 trial, thus 1.
#
# # Make sure that we have at least 80% for training
# while sum(split) < (p * N):
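The commented-out block above sketches a Bernoulli-style split; a runnable version of the same idea (a sketch only, and the re-draw loop is just one way to enforce the minimum training fraction, not necessarily what the original script did):

import numpy as np

rng = np.random.RandomState(0)
N = 1000   # number of items (illustrative)
p = 0.8    # probability that an item goes to the training split

split = rng.binomial(1, p, N)    # 1 = train, 0 = test/val
while split.sum() < p * N:       # make sure at least 80% ends up in training
    split = rng.binomial(1, p, N)

print(split.sum() / N)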
Example #59
0
def balanced_train_test_split(X, y, test_size=None, train_size=None, bootstrap=False,
                              random_state=None):
    """ Split the data into a balanced training set and test set of some given size.

        For a dataset with an unequal number of samples in each class, one useful procedure
        is to split the data into a training set and a test set in such a way that the classes
        are balanced.
        
        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
            Feature matrix.
        
        y : array, shape = [n_features]
            Target vector.

        test_size : float or int (default=0.3)
            If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the test split. If int, represents the absolute number of test samples.
            If None, the value is automatically set to the complement of the train size.
            If train size is also None, test size is set to 0.3.
        
        train_size : float or int (default=1-test_size)
            If float, should be between 0.0 and 1.0 and represent the proportion of the dataset
            to include in the train split. If int, represents the absolute number of train samples.
            If None, the value is automatically set to the complement of the test size.

        bootstrap : boolean, optional (default=False)
            If True, sample each class with replacement, which allows a balanced split even
            when the smallest class has fewer examples than requested.

        random_state : int, optional (default=None)
            Pseudo-random number generator state used for random sampling.
        
        Returns
        -------
        X_train : array
            The feature matrix (one sample per row) of the training set.
            
        X_test : array
            The feature matrix (one sample per row) of the test set.
            
        y_train : array
            The target vector in the training set.
            
        y_test : array
            The target vector in the test set.
    """
    
    # initialise the random number generator
    rng = RandomState(random_state)

    # make sure X and y are numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)
    
    # get information about the class distribution
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    cls_count = np.bincount(y_indices)

    # get the training and test size
    train_size, test_size = _get_train_test_size(train_size, test_size, len(y))

    # number of samples in each class that is included in the training and test set
    n_train = np.round(train_size / n_classes).astype(int)
    n_test = np.round(test_size / n_classes).astype(int)
    n_total = n_train + n_test
    
    # make sure we have enough samples to create a balanced split
    min_count = min(cls_count)
    if min_count < (n_train + n_test) and not bootstrap:
        raise ValueError('The smallest class contains {} examples, which is not '
                         'enough to create a balanced split. Choose a smaller size '
                         'or enable bootstrapping.'.format(min_count))
    
    # selected indices are stored here
    train = []
    test = []
    
    # get the desired sample from each class
    for i, cls in enumerate(classes):
        if bootstrap:
            shuffled = rng.choice(cls_count[i], n_total, replace=True)
        else:
            shuffled = rng.permutation(cls_count[i])
        
        cls_i = np.where(y == cls)[0][shuffled]
        train.extend(cls_i[:n_train])
        test.extend(cls_i[n_train:n_total])
        
    train = list(rng.permutation(train))
    test = list(rng.permutation(test))
    
    return X[train], X[test], y[train], y[test]
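A usage sketch, under the assumption that the _get_train_test_size helper referenced above simply passes integer sizes through (the helper itself is not shown in this snippet):

import numpy as np

# Imbalanced toy data: 80 samples of class 0, 20 samples of class 1.
X = np.random.RandomState(0).randn(100, 4)
y = np.array([0] * 80 + [1] * 20)

X_train, X_test, y_train, y_test = balanced_train_test_split(
    X, y, train_size=20, test_size=10, random_state=0)

print(np.bincount(y_train))  # expected: [10 10] -- a balanced training set
print(np.bincount(y_test))   # expected: [5 5]   -- a balanced test set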
Example #60
0
content_draw = get_content_generator(rs, zipf_param, contentHistory, 5000000, 1, content_duration)

# contentHistory = get_updated_history(contentHistory, content)

# update_vcdn_storage(g, contentHistory)

# winner, price = create_content_delivery(g=g, peers=servers, content=content,consumer=consumer)

env = simpy.Environment()
the_time = 30

ticker = get_ticker(rs, poisson_param)

while the_time < max_time_experiment:
    location = rs.choice(consumers)
    the_time = ticker() + the_time
    User(g, {"CDN": cdns, "VCDN": vcdns, "MUCDN": mucdns}, env, location, the_time, content_draw)

for vcdn in vcdns:
    TE(rs, env, vcdn, g, contentHistory, refresh_delay=vcdn_refresh_delay, download_delay=vcdn_download_delay,
       concurent_download=vcdn_concurent_download)

for mucdn in mucdns:
    TE(rs, env, mucdn, g, contentHistory, refresh_delay=mucdn_refresh_delay, download_delay=mucdn_download_delay,
       concurent_download=mucdn_concurent_download)


def capacity_vcdn_monitor():
    while True:
        yield env.timeout(11)