Example #1
def data_sampling(data, num_of_samples):
    random = check_random_state(seed=None)
    n_samples, n_features = data.shape

    if num_of_samples > n_samples:
        # Draw every row once, then top up with random repeats.
        indexes = np.concatenate(
            (sample_without_replacement(n_samples, n_samples, random_state=random),
             random.randint(0, n_samples, num_of_samples - n_samples)),
            axis=None)
    else:
        indexes = sample_without_replacement(n_samples, num_of_samples,
                                             random_state=random)

    # The sampled indices are positional, so use .iloc rather than .loc.
    return data.iloc[indexes]
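A minimal usage sketch (the DataFrame below is illustrative, not from the original project): when num_of_samples exceeds the number of rows, every row is drawn once and the remainder are random repeats.

import numpy as np
import pandas as pd
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("abc"))
print(data_sampling(df, 2))  # 2 distinct rows
print(data_sampling(df, 6))  # all 4 rows, plus 2 repeated rows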
Example #2
    def mutation(self, pm):
        n_mutations_in_features = self.random_state.binomial(self.genome_features.shape[0], pm)
        n_mutations_in_samples = self.random_state.binomial(self.genome_samples.shape[0], pm)

        mutated_samples = sample_without_replacement(
            self.genome_samples.shape[0], n_mutations_in_samples, random_state=self.random_state
        )
        mutated_features = sample_without_replacement(
            self.genome_features.shape[0], n_mutations_in_features, random_state=self.random_state
        )
        self.genome_samples[mutated_samples] = ~self.genome_samples[mutated_samples]
        self.genome_features[mutated_features] = ~self.genome_features[mutated_features]
Example #3
def data_sampling(data, replacement, num_of_samples):
    random = check_random_state(seed=None)
    n_samples, n_features = data.shape

    if replacement:
        indexes = random.randint(0, n_samples, num_of_samples)
    else:
        # Without replacement we can draw at most n_samples rows.
        if num_of_samples > n_samples:
            indexes = sample_without_replacement(n_samples, n_samples,
                                                 random_state=random)
        else:
            indexes = sample_without_replacement(n_samples, num_of_samples,
                                                 random_state=random)

    # The sampled indices are positional, so use .iloc rather than .loc.
    return data.iloc[indexes]
Example #4
def generate_indices(random_state, bootstrap, n_population, n_samples):
    """ Draw randomly sampled indices. Internal use only.

    See sklearn/ensemble/bagging.py

    Parameters
    ----------
    random_state : RandomState
        A random number generator instance to define the state of the random
        permutations generator.

    bootstrap : bool
        Whether to draw indices with replacement (bootstrapping)

    n_population : int
        Specifies the population size when generating indices

    n_samples : int
        Specifies number of samples to draw

    Returns
    -------
    indices : numpy array, shape (n_samples,)
        randomly drawn indices
    """

    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population, n_samples,
                                             random_state=random_state)

    return indices
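A short sketch of the two modes (assuming numpy and sklearn's sample_without_replacement are imported as above): with bootstrap=True duplicates may appear, with bootstrap=False all indices are distinct.

import numpy as np

rs = np.random.RandomState(0)
print(generate_indices(rs, True, 10, 8))   # with replacement: duplicates possible
print(generate_indices(rs, False, 10, 8))  # without replacement: 8 distinct values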
Example #5
    def __init__(self, n_samples, n_features, ps, pf, random_state):
        self.random_state = random_state
        self.genome_features = np.zeros((n_features,), dtype=bool)
        self.genome_samples = np.zeros((n_samples,), dtype=bool)

        # sample_without_replacement expects integer counts.
        n_pick_samples = int(np.floor(n_samples * ps))
        n_pick_features = int(np.floor(n_features * pf))

        picked_samples = sample_without_replacement(
            n_samples, n_pick_samples, random_state=random_state)
        picked_features = sample_without_replacement(
            n_features, n_pick_features, random_state=random_state)
        self.genome_samples[picked_samples] = True
        self.genome_features[picked_features] = True

        self.cache_est_weight = 1
        self.cache_contribution = 0
        self.cache_predictions = None
Example #6
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # sample generates all possible permutations
    n_population = 10

    # a large number of trials prevents false negatives without slowing normal
    # case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(sample_without_replacement(n_population,
                                                        n_samples))] = None

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)" %
                (len(output), n_expected))
Example #7
def run_test(X, Y, A, B, Sigma=None, proj=None, n_combinations=50000):
    X, Y, A, B = normalize_list([X, Y, A, B], Sigma=Sigma, proj=proj)
    if Sigma is not None:
        A = np.matmul(A, Sigma)
        B = np.matmul(B, Sigma)

    base_statistics = statistics(X, Y, A, B)

    union_XY = np.vstack((X, Y))
    xy_size = union_XY.shape[0]
    x_size = X.shape[0]
    count = 0

    all_idx = set(range(xy_size))

    if comb(xy_size, x_size) > n_combinations:
        n_total = n_combinations
        for _ in range(n_combinations):
            group_1_idx = sample_without_replacement(xy_size, x_size)
            group_2_idx = list(all_idx.difference(group_1_idx))
            sample_stat = statistics(union_XY[group_1_idx],
                                     union_XY[group_2_idx], A, B)
            count += sample_stat > base_statistics
    else:
        # Exhaustive enumeration: normalise by the true number of splits,
        # not by n_combinations.
        n_total = comb(xy_size, x_size, exact=True)
        for group_1_idx in combinations(range(xy_size), x_size):
            group_2_idx = list(all_idx.difference(group_1_idx))
            sample_stat = statistics(union_XY[list(group_1_idx)],
                                     union_XY[group_2_idx], A, B)
            count += sample_stat > base_statistics

    p_val = count / n_total
    effect_val = effect_size(X, Y, A, B)

    print('P-val is %f; effect size is %f' % (p_val, effect_val))
    return p_val, effect_val
Example #8
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert len(s) == n_samples
        unique = np.unique(s)
        assert np.size(unique) == n_samples
        assert np.all(unique < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0
Example #9
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # sample generates all possible permutations
    n_population = 10

    # a large number of trials prevents false negatives without slowing normal
    # case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(
                sample_without_replacement(n_population, n_samples))] = None

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)" %
                (len(output), n_expected))
Example #10
def sample_without_replacement_method(n_population,
                                      n_samples,
                                      random_state=None):
    # ``m`` is the sampling method name taken from the enclosing scope
    # (see the test in Example #33 for the full context).
    return sample_without_replacement(n_population,
                                      n_samples,
                                      method=m,
                                      random_state=random_state)
Example #11
def generate_indices(random_state, bootstrap, n_population, n_samples):
    """ Draw randomly sampled indices. Internal use only.

    See sklearn/ensemble/bagging.py

    Parameters
    ----------
    random_state : RandomState
        A random number generator instance to define the state of the random
        permutations generator.

    bootstrap : bool
        Whether to draw indices with replacement (bootstrapping)

    n_population : int
        Specifies the population size when generating indices

    n_samples : int
        Specifies number of samples to draw

    Returns
    -------
    indices : numpy array, shape (n_samples,)
        randomly drawn indices
    """

    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population,
                                             n_samples,
                                             random_state=random_state)

    return indices
Example #12
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert_equal(len(s), n_samples)
        unique = np.unique(s)
        assert_equal(np.size(unique), n_samples)
        assert_true(np.all(unique < n_population))

    # test edge case n_population == n_samples == 0
    assert_equal(np.size(sample_without_replacement(0, 0)), 0)
Example #13
    def __iter__(self):
        # check if all distributions are given as lists
        # in this case we want to sample without replacement
        all_lists = np.all([not hasattr(v, "rvs")
                            for v in self.param_distributions.values()])
        rnd = check_random_state(self.random_state)

        if all_lists:
            # look up sampled parameter settings in parameter grid
            param_grid = ParameterGrid(self.param_distributions)
            grid_size = len(param_grid)

            if grid_size < self.n_iter:
                raise ValueError(
                    "The total space of parameters %d is smaller "
                    "than n_iter=%d." % (grid_size, self.n_iter)
                    + " For exhaustive searches, use GridSearchCV.")
            for i in sample_without_replacement(grid_size, self.n_iter,
                                                random_state=rnd):
                yield param_grid[i]

        else:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(self.param_distributions.items())
            for _ in six.moves.range(self.n_iter):
                params = dict()
                for k, v in items:
                    if hasattr(v, "rvs"):
                        if sp_version < (0, 16):
                            params[k] = v.rvs()
                        else:
                            params[k] = v.rvs(random_state=rnd)
                    else:
                        params[k] = v[rnd.randint(len(v))]
                yield params
Example #14
def function4(percent, targetX):
    """Add noise to the given percentage of samples."""
    percent = percent / 100
    # sample_without_replacement expects an integer sample count.
    sample = sample_without_replacement(len(targetX), int(percent * len(targetX)))
    change = random.sample(range(0, 64), 10)
    for i in change:
        # Index the rows and the pixel column together; chained indexing
        # (targetX[sample][i]) would only modify a temporary copy.
        targetX[sample, i] = abs(targetX[sample, i] - 16)

    return targetX
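A hedged usage sketch for the corrected indexing; the array shape mirrors the 64-pixel digits layout the code implies, and the data is synthetic.

import random
import numpy as np
from sklearn.utils.random import sample_without_replacement

X = np.random.RandomState(0).randint(0, 17, size=(100, 64)).astype(float)
X_noisy = function4(10, X.copy())  # perturb 10 pixel columns in 10% of the rows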
Example #15
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population,
                                             n_samples,
                                             random_state=random_state)
    return indices
Example #16
    def fit(self, X, y=None):
        X = check_array(X)
        n_samples, n_features = X.shape

        random_state = check_random_state(self.random_state)
        self.components = sample_without_replacement(
            n_features, self.n_components, random_state=random_state)
        return self
Example #17
def check_edge_case_of_sample_int(sample_without_replacement):
    # n_population < n_sample
    assert_raises(ValueError, sample_without_replacement, 0, 1)
    assert_raises(ValueError, sample_without_replacement, 1, 2)

    # n_population == n_samples
    assert_equal(sample_without_replacement(0, 0).shape, (0, ))

    assert_equal(sample_without_replacement(1, 1).shape, (1, ))

    # n_population >= n_samples
    assert_equal(sample_without_replacement(5, 0).shape, (0, ))
    assert_equal(sample_without_replacement(5, 1).shape, (1, ))

    # n_population < 0 or n_samples < 0
    assert_raises(ValueError, sample_without_replacement, -1, 5)
    assert_raises(ValueError, sample_without_replacement, 5, -1)
Example #18
def check_edge_case_of_sample_int(sample_without_replacement):

    # n_population < n_sample
    assert_raises(ValueError, sample_without_replacement, 0, 1)
    assert_raises(ValueError, sample_without_replacement, 1, 2)

    # n_population == n_samples
    assert_equal(sample_without_replacement(0, 0).shape, (0, ))

    assert_equal(sample_without_replacement(1, 1).shape, (1, ))

    # n_population >= n_samples
    assert_equal(sample_without_replacement(5, 0).shape, (0, ))
    assert_equal(sample_without_replacement(5, 1).shape, (1, ))

    # n_population < 0 or n_samples < 0
    assert_raises(ValueError, sample_without_replacement, -1, 5)
    assert_raises(ValueError, sample_without_replacement, 5, -1)
Example #19
def _minify_dataset(ratings_df: pd.DataFrame,
                    random_state: int) -> pd.DataFrame:
    users_count = len(ratings_df['user_id'].unique())
    samples = 200 if users_count > 200 else users_count // 2
    users_subset = set(
        sample_without_replacement(users_count,
                                   samples,
                                   random_state=random_state))
    return ratings_df[ratings_df['user_id'].isin(users_subset)]
Example #20
def get_random_gene_df(gene_df, n_genes, label_col="type"):
    labels = gene_df.loc[:, label_col]
    unlab_df = gene_df.drop(label_col, axis=1)
    # Sample columns from the unlabeled frame so the label column
    # cannot be drawn as a "gene".
    index_set = sample_without_replacement(unlab_df.shape[1], n_genes)
    gene_arr_rand = unlab_df.columns[index_set]
    gene_df_rand = unlab_df[gene_arr_rand].copy()
    gene_df_rand[label_col] = labels
    return gene_df_rand
Example #21
def subsampled_hadamard_matrix(n_components, n_features, random_state=None):
    """Sub-sampled Hadamard matrix with shape (n_components, n_features)

    A Hadamard matrix of shape at least (n_components, n_features) is
    subsampled without replacement.

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.

    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d" %
                         n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d" %
                         n_features)

    random_state = check_random_state(random_state)
    # Hadamard matrices exist for power-of-two sizes; pick the smallest
    # power of two that covers both dimensions.
    n_hadamard_size = int(max(2 ** np.ceil(np.log2(x))
                              for x in (n_components, n_features)))

    row = sample_without_replacement(n_hadamard_size,
                                     n_components,
                                     random_state=random_state)
    col = sample_without_replacement(n_hadamard_size,
                                     n_features,
                                     random_state=random_state)
    hadamard_matrix = sp_hadamard(n_hadamard_size, dtype=np.float64)[row][:, col]
    hadamard_matrix *= 1 / np.sqrt(n_components)
    return hadamard_matrix
Example #22
    def observer(state):
        """Observe simulation state with uncertainty -> approximate state."""

        # First average over cells to get non-spatial distribution
        ncells = int(np.round(len(state) / 15, 0))
        state_by_cell = np.reshape(state, (int(ncells), 15))

        # Species proportions
        species_by_cell = np.array([
            np.sum(state_by_cell[:, 0:3], axis=1),
            np.sum(state_by_cell[:, 3:6], axis=1),
            np.sum(state_by_cell[:, 6:9], axis=1),
            np.sum(state_by_cell[:, 9:12], axis=1),
            np.sum(state_by_cell[:, 12:14], axis=1), state_by_cell[:, 14]
        ]).T

        # Add empty space to be sampled also
        space = np.array(1.0 - np.sum(state_by_cell, axis=1)).reshape((ncells, 1))
        cell_props = np.append(species_by_cell, space, axis=1)

        # Create population of hosts and sample appropriately
        population = np.array(pop_size * cell_props)
        obs_states = []
        for i in range(ncells):
            sample = sample_without_replacement(pop_size, n_samples)
            bins = np.append([0.0], np.cumsum(population[i]))
            observed_species = np.histogram(sample, bins)[0]

            observed_state = np.zeros(15)

            # Tanoak
            for j in range(4):
                idcs = ((3 * j), (3 * j + 3))
                inf_probs = state_by_cell[i, idcs[0]:idcs[1]] / np.sum(
                    state_by_cell[i, idcs[0]:idcs[1]])
                inf_sample = np.random.choice(3,
                                              observed_species[j],
                                              p=inf_probs)
                observed_state[idcs[0]:idcs[1]] = np.histogram(
                    inf_sample, range(4))[0]

            # Bay
            inf_probs = state_by_cell[i, 12:14] / np.sum(state_by_cell[i,
                                                                       12:14])
            inf_sample = np.random.choice(2, observed_species[4], p=inf_probs)
            observed_state[12:14] = np.histogram(inf_sample, range(3))[0]

            # Redwood
            observed_state[14] = observed_species[5]

            obs_states.append(observed_state)

        obs_states = np.array(obs_states)
        obs_state = (np.sum(obs_states, axis=0) / (n_samples * ncells))

        return obs_state
Example #23
def bootstrap_generator(n_bootstrap_iterations, sample_fraction, X, random_state=None):
    """Generates subsamples (drawn without replacement) from the dataset."""
    if random_state is not None:
        np.random.seed(random_state)
        random.seed(random_state)
    n_samples = len(X)
    n_subsamples = np.floor(sample_fraction * n_samples).astype(int)
    for _ in range(n_bootstrap_iterations):
        subsample = sample_without_replacement(n_samples, n_subsamples)
        yield subsample
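Usage sketch, with the imports the snippet assumes: three 80% subsamples of a toy dataset.

import random
import numpy as np
from sklearn.utils.random import sample_without_replacement

X = np.arange(10)
for idx in bootstrap_generator(3, 0.8, X, random_state=0):
    print(idx)  # 8 distinct indices per iteration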
Example #24
def subsampled_hadamard_matrix(n_components, n_features, random_state=None):
    """Sub-sampled Hadamard matrix with shape (n_components, n_features)

    A Hadamard matrix of shape at least (n_components, n_features) is
    subsampled without replacement.

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.

    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d" %
                         n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d" %
                         n_features)

    random_state = check_random_state(random_state)
    n_hadamard_size = int(max(2 ** np.ceil(np.log2(x))
                              for x in (n_components, n_features)))

    row = sample_without_replacement(n_hadamard_size, n_components,
                                     random_state=random_state)
    col = sample_without_replacement(n_hadamard_size, n_features,
                                     random_state=random_state)
    hadamard_matrix = sp_hadamard(n_hadamard_size, dtype=np.float64)[row][:, col]
    hadamard_matrix *= 1 / np.sqrt(n_components)
    return hadamard_matrix
Example #25
    def decoderaccuracy_with_numcells(self, x, y, iterations, task,
                                      classifier_type):
        numcells = np.size(x, 1)
        percsamples = [1, 5, 10, 20, 50, 80, 100]
        numsamples = [int(numcells * (p / 100)) for p in percsamples]
        numcells_dataframe = pd.DataFrame(
            columns=['SampleSize', 'Split', 'R2', 'rho', 'score', 'errorprob'])
        k = KFold(n_splits=numcell_kfold_splits,
                  random_state=None,
                  shuffle=False)
        for n, ns in enumerate(numsamples):
            print('Fitting on %d neurons' % ns)
            for i in np.arange(iterations):
                cells = sample_without_replacement(numcells, ns)
                x_resample = x[:, cells]
                count_cv = 1
                # Also do k-fold validation for these iterations
                for train_index, test_index in k.split(x_resample):
                    # print(f'Validation %d' % count_cv)
                    # Split data
                    x_rs_train, x_rs_test = x_resample[
                        train_index], x_resample[test_index]
                    y_rs_train, y_rs_test = y[train_index], y[test_index]

                    nbpfmodel = self.fit_SVM(x_rs_train,
                                             y_rs_train,
                                             classifier_type=classifier_type)
                    scores, prediction, probability = self.validate_model(
                        classifier_type=classifier_type,
                        model=nbpfmodel,
                        x_test=x_rs_test,
                        y_test=y_rs_test,
                        task=task,
                        plotflag=plot_numcells)
                    backend.clear_session()

                    R2 = CommonFunctions.get_R2(y_actual=y_rs_test,
                                                y_predicted=prediction)
                    rho = CommonFunctions.get_R2(y_actual=y_rs_test,
                                                 y_predicted=prediction)

                    numcells_dataframe = numcells_dataframe.append(
                        {
                            'SampleSize': '%d%%' % percsamples[n],
                            'Split': count_cv,
                            'R2': R2,
                            'rho': rho,
                            'score': scores,
                            'errorprob': probability
                        },
                        ignore_index=True)
                    count_cv += 1

        return numcells_dataframe
Example #26
def _generate_ts_indices(random_state, bootstrap, n_population, block_size):
    """Draw randomly sampled indices."""
    # Draw sample indices
    if bootstrap:
        indices = mb_bootstrap_indicies(n_population, block_size)
    else:
        # FIXME: block sampling without replacement is not implemented yet.
        raise NotImplementedError(
            "sampling without replacement is not supported for block indices"
        )

    return indices
Example #27
def data_pseudo_labeling(unlabeled_dataset, neigh, keywords):
    from sklearn.utils.random import sample_without_replacement
    # pseudo_labeled_data_size is assumed to be a module-level constant.
    index_pseudo_data = sample_without_replacement(len(unlabeled_dataset),
                                                   pseudo_labeled_data_size)
    pseudo_data = unlabeled_dataset.iloc[sorted(index_pseudo_data)]
    pseudo_predict = neigh.predict(pseudo_data[keywords])
    pseudo_data.insert(0, 'Class', pseudo_predict)
    selected_columns = keywords.insert(0, 'Patent_Number')
    selected_columns = selected_columns.insert(0, 'Class')
    pseudo_labeled_data = pseudo_data.filter(selected_columns)

    return pseudo_labeled_data
Example #28
def _generate_random_features(random_state, bootstrap, n_population,
                              n_samples):
    """Draw randomly sampled indices."""
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population,
                                             n_samples,
                                             random_state=random_state)

    return indices
Example #29
def reportAccuracyVersusSparsityOfInput(df, normalized_df, model, labeled_tags,
                                        train_index, test_index, percent):
    N_train = train_index.shape[0]
    # Integer sample count for sample_without_replacement.
    train_subset_index = sample_without_replacement(N_train,
                                                    int(N_train * percent / 100))
    subset_train_index = train_index[train_subset_index]
    test_index_updated = set(train_index).union(set(test_index)).difference(
        set(subset_train_index))
    accuracy, c_matrix, ra_score = testClassification(df, normalized_df, model,
                                                      labeled_tags,
                                                      list(subset_train_index),
                                                      list(test_index_updated))
    return accuracy, ra_score
Example #30
def _generate_hypercube(samples, dimensions, rng):
    """Returns distinct binary samples of length dimensions
    """
    if dimensions > 30:
        return np.hstack([
            rng.randint(2, size=(samples, dimensions - 30)),
            _generate_hypercube(samples, 30, rng)
        ])
    out = sample_without_replacement(2**dimensions, samples,
                                     random_state=rng).astype(dtype='>u4',
                                                              copy=False)
    out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
    return out
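A quick sketch of the output: each row is a distinct binary vector, obtained by drawing distinct integers in [0, 2**dimensions) and unpacking their bits.

import numpy as np
from sklearn.utils.random import sample_without_replacement

rng = np.random.RandomState(0)
print(_generate_hypercube(6, 4, rng))  # (6, 4) array of 0/1 rows, all distinct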
Example #31
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """Draw randomly sampled indices. Internal use only.
    See sklearn/ensemble/bagging.py
    """
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population,
                                             n_samples,
                                             random_state=random_state)

    return indices
Example #32
def bootstrap_generator(n_bootstrap_iterations,
                        sample_fraction,
                        X,
                        random_state=None):
    """Generates bootstrap samples from dataset."""
    n_samples = len(X)
    n_subsamples = np.floor(sample_fraction * n_samples).astype(int)
    subsamples = []
    for _ in range(n_bootstrap_iterations):
        subsample = sample_without_replacement(n_samples,
                                               n_subsamples,
                                               random_state=None)
        subsamples.append(subsample)
    return subsamples
Example #33
def test_sample_without_replacement_algorithms():
    methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")

    for m in methods:
        sample_without_replacement_method = \
            lambda n_population, n_samples, random_state=None: \
                sample_without_replacement(n_population,
                                           n_samples,
                                           method=m,
                                           random_state=random_state)

        check_edge_case_of_sample_int(sample_without_replacement_method)
        check_sample_int(sample_without_replacement_method)
        check_sample_int_distribution(sample_without_replacement_method)
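For reference, these are exactly the method names sklearn's sample_without_replacement accepts; a one-liner demonstration:

from sklearn.utils.random import sample_without_replacement

for m in ("auto", "tracking_selection", "reservoir_sampling", "pool"):
    print(m, sample_without_replacement(100, 5, method=m, random_state=0))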
Example #34
def subsampled_identity_matrix(n_components,
                               n_features,
                               random_state=None,
                               with_replacement=True):
    """Sub-sampled identity matrix with shape (n_components, n_features)

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    with_replacement : bool,
        Whether to draw components with replacement.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.

    """

    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d" %
                         n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d" %
                         n_features)

    rng = check_random_state(random_state)

    components = sparse.dia_matrix((np.ones(n_features), [0]),
                                   shape=(n_features, n_features)).tocsr()
    if with_replacement:
        mask = rng.randint(n_features, size=(n_components, ))

    else:
        mask = sample_without_replacement(n_features,
                                          n_components,
                                          random_state=rng)

    components = components[mask]
    return components * np.sqrt(1.0 * n_features / n_components)
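Shape sketch, with the imports the snippet assumes: each row of the result selects a single input feature, scaled so the projection preserves norms in expectation.

import numpy as np
from scipy import sparse
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

R = subsampled_identity_matrix(3, 8, random_state=0, with_replacement=False)
print(R.shape)      # (3, 8)
print(R.toarray())  # one non-zero entry of sqrt(8/3) per row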
Example #35
    def get_all_indices(self,
                        n_samples=None,
                        max_samples=None,
                        random_state=None):
        """Get the indices on which to evaluate the fitness of a program.

        Parameters
        ----------
        n_samples : int
            The number of samples.

        max_samples : int
            The maximum number of samples to use.

        random_state : RandomState instance
            The random number generator.

        Returns
        -------
        indices : array-like, shape = [n_samples]
            The in-sample indices.

        not_indices : array-like, shape = [n_samples]
            The out-of-sample indices.

        """
        if self._indices_state is None and random_state is None:
            raise ValueError('The program has not been evaluated for fitness '
                             'yet, indices not available.')

        if n_samples is not None and self._n_samples is None:
            self._n_samples = n_samples
        if max_samples is not None and self._max_samples is None:
            self._max_samples = max_samples
        if random_state is not None and self._indices_state is None:
            self._indices_state = random_state.get_state()

        indices_state = check_random_state(None)
        indices_state.set_state(self._indices_state)

        not_indices = sample_without_replacement(self._n_samples,
                                                 self._n_samples -
                                                 self._max_samples,
                                                 random_state=indices_state)
        sample_counts = np.bincount(not_indices, minlength=self._n_samples)
        indices = np.where(sample_counts == 0)[0]

        return indices, not_indices
Example #36
def make_train_validation_test_triplets_list(triplet_file, random_seed=None):

    random.seed(random_seed)

    triplets = np.loadtxt(triplet_file)

    # sample part of the triplets
    n_triplets = len(triplets)
    triplets = triplets[sample_without_replacement(n_population=n_triplets,
                                                   n_samples=40000)]

    train_triplets_file = "./train_triplets_list.txt"
    validation_triplets_file = "./validation_triplets_list.txt"
    test_triplets_file = "./test_triplets_list.txt"

    if os.path.exists(train_triplets_file) and os.path.exists(
            validation_triplets_file) and os.path.exists(test_triplets_file):
        triplets_train = np.loadtxt(train_triplets_file)
        triplets_validation = np.loadtxt(validation_triplets_file)
        triplets_test = np.loadtxt(test_triplets_file)

    else:
        # Use a set for fast membership checks below.
        train_images = set(random.sample(range(0, 5000), 3600))

        triplets_train = [
            t for t in triplets
            if (t[0] in train_images and t[1] in train_images
                and t[2] in train_images)
        ]
        triplets_vt = [
            t for t in triplets
            if (t[0] not in train_images and t[1] not in train_images
                and t[2] not in train_images)
        ]

        triplets_validation, triplets_test = train_test_split(triplets_vt,
                                                              train_size=0.5)

        np.savetxt(train_triplets_file, triplets_train)
        np.savetxt(validation_triplets_file, triplets_validation)
        np.savetxt(test_triplets_file, triplets_test)

    print("Train dataset size:      %d" % (len(triplets_train)))
    print("Validation dataset size: %d" % (len(triplets_validation)))
    print("Test dataset size:       %d" % (len(triplets_test)))

    return triplets_train, triplets_validation, triplets_test
Example #37
def latin_hypercube_sampling(bounds, pop):
    """Latin hypercube sampling for more uniformly distributed initial
    parameter values in differential evolution.

    Parameters
    ----------
    bounds : np.array
        Bounds to generate parameters within, of shape (n_parameters, 2)
    pop : int
        Number of sets of initial parameters to generate
    """
    ranges = np.linspace(bounds[:, 0], bounds[:, 1], pop + 1).T
    ranges = np.array([ranges[:, :-1], ranges[:, 1:]]).T
    cs = np.random.uniform(low=ranges[:, :, 0], high=ranges[:, :, 1])
    a = sample_without_replacement(pop**len(bounds), pop)
    a = np.array(np.unravel_index(a, [pop] * len(bounds)))
    return np.array([cs[a[i], i] for i in range(len(bounds))]).T
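Usage sketch: five stratified parameter sets for two parameters; each column covers its range in pop equal-width strata.

import numpy as np
from sklearn.utils.random import sample_without_replacement

bounds = np.array([[0.0, 1.0], [10.0, 20.0]])
params = latin_hypercube_sampling(bounds, 5)
print(params.shape)  # (5, 2)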
Example #38
def subsampled_identity_matrix(n_components, n_features, random_state=None,
                               with_replacement=True):
    """Sub-sampled identity matrix with shape (n_components, n_features)

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    with_replacement : bool,
        Whether to draw components with replacement.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.

    """

    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d" %
                         n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d" %
                         n_features)

    rng = check_random_state(random_state)

    components = sparse.dia_matrix((np.ones(n_features), [0]),
                                   shape=(n_features, n_features)).tocsr()
    if with_replacement:
        mask = rng.randint(n_features, size=(n_components,))

    else:
        mask = sample_without_replacement(n_features, n_components,
                                          random_state=rng)

    components = components[mask]
    return components * np.sqrt(1.0 * n_features / n_components)
Example #39
    def get_all_indices(self, n_samples=None, max_samples=None,
                        random_state=None):
        """Get the indices on which to evaluate the fitness of a program.

        Parameters
        ----------
        n_samples : int
            The number of samples.

        max_samples : int
            The maximum number of samples to use.

        random_state : RandomState instance
            The random number generator.

        Returns
        -------
        indices : array-like, shape = [n_samples]
            The in-sample indices.

        not_indices : array-like, shape = [n_samples]
            The out-of-sample indices.
        """
        if self._indices_state is None and random_state is None:
            raise ValueError('The program has not been evaluated for fitness '
                             'yet, indices not available.')

        if n_samples is not None and self._n_samples is None:
            self._n_samples = n_samples
        if max_samples is not None and self._max_samples is None:
            self._max_samples = max_samples
        if random_state is not None and self._indices_state is None:
            self._indices_state = random_state.get_state()

        indices_state = check_random_state(None)
        indices_state.set_state(self._indices_state)

        not_indices = sample_without_replacement(
            self._n_samples,
            self._n_samples - self._max_samples,
            random_state=indices_state)
        sample_counts = np.bincount(not_indices, minlength=self._n_samples)
        indices = np.where(sample_counts == 0)[0]

        return indices, not_indices
Example #40
    def fit(self, X, y, random_state=None):
        """
        Train ENOLS on the given training set.

        Parameters
        ----------
        X: an input array of shape (n_samples, n_features)
        y: an array of shape (n_samples,) containing the target values

        Returns
        -------
        self: the fitted model
        """

        # use random instead of np.random to sample random numbers below
        random = check_random_state(random_state)

        estimators = [('lr', LinearRegression())]

        if isinstance(self.sample_size, int):
            self.sample_size = 'reservoir_sampling'

        # add all the trained OLS models to this list
        self.estimators_lr, self.estimators_TSR, self.estimators_enols = [], [], []
        for i in range(self.n_estimators):
            samples = sample_without_replacement(n_population=random.choice([50, 100]),
                                                 n_samples=random.choice([10, 20]),
                                                 random_state=random_state, method=self.sample_size)

            X_train, y_train = [], []
            for idx in samples:
                X_train.append(X[idx]), y_train.append(y[idx])

            reg = LinearRegression()
            reg.fit(np.array(X_train), np.array(y_train))

            tsr = TheilSenRegressor()
            tsr.fit(np.array(X_train), np.array(y_train))

            enol = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
            enol.fit(np.array(X_train), np.array(y_train))

            self.estimators_lr.append(reg), self.estimators_TSR.append(tsr), self.estimators_enols.append(enol)

        return self
Example #41
    def lesinn(self, x_train, to_query):
        ensemble_size = 50
        subsample_size = int(.01 * x_train.shape[0])
        scores = np.zeros([to_query.shape[0], 1])
        seeds = self.Trainer.rng.randint(MAX_INT, size=ensemble_size)
        for i in range(0, ensemble_size):
            rs = np.random.RandomState(seeds[i])
            sid = sample_without_replacement(n_population=x_train.shape[0],
                                             n_samples=subsample_size,
                                             random_state=rs)
            subsample = x_train[sid]
            kdt = KDTree(subsample, metric='euclidean')
            dists, indices = kdt.query(to_query, k=self.n_neighbors)
            dists = np.mean(dists, axis=1)[:, np.newaxis]
            scores += dists
        scores = scores / ensemble_size
        return scores
Example #42
    def equalise_laps_with_numlaps_innorew(Imgobj, X, Y, Tasklabel):
        stoplicklap = Imgobj.Parsed_Behavior['lick_stop'].item()
        numlaps_afterlickstops = \
            Imgobj.Parsed_Behavior['numlaps'].item()['Task2'] - stoplicklap
        print('Number of laps being chosen', numlaps_afterlickstops)
        numlaps_currenttask = \
            Imgobj.Parsed_Behavior['numlaps'].item()[Tasklabel] - 3

        samplelaps = sample_without_replacement(numlaps_currenttask,
                                                numlaps_afterlickstops)
        lapframes = \
            [scipy.io.loadmat(os.path.join(Imgobj.FolderName, 'Behavior', p))['E'].T for p in Imgobj.PlaceFieldData if
             Tasklabel in p][0]
        print(samplelaps)
        # Membership test against the sampled laps; an elementwise ==
        # comparison would only broadcast, not test membership.
        keep = np.where(np.isin(lapframes, samplelaps))[0]
        X_eq = X[keep, :]
        Y_eq = Y[keep]

        return X_eq, Y_eq
Example #43
def _generate_hypercube(samples, dimensions, rng):
    """Returns distinct binary samples of length dimensions
    """
    if not has_sklearn():
        raise RuntimeError("Scikit-learn is needed to run "
                           "make_classification.")

    from sklearn.utils.random import sample_without_replacement
    if dimensions > 30:
        return np.hstack([np.random.randint(2, size=(samples,
                                                     dimensions - 30)),
                          _generate_hypercube(samples, 30, rng)])
    random_state = int(rng.randint(dimensions))
    out = sample_without_replacement(2 ** dimensions, samples,
                                     random_state=random_state).astype(
                                         dtype='>u4', copy=False)
    out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
    return out
Example #44
def _aom_moa_helper(mode, scores, n_buckets, method, bootstrap_estimators,
                    random_state):
    """Internal helper function for Average of Maximum (AOM) and
    Maximum of Average (MOA). See :cite:`aggarwal2015theoretical` for details.

    First dividing estimators into subgroups, take the maximum/average score
    as the subgroup score. Finally, take the average/maximum of all subgroup
    outlier scores.

    Parameters
    ----------
    mode : str
        Define the operation model, either "AOM" or "MOA".

    scores : numpy array of shape (n_samples, n_estimators)
        The score matrix outputted from various estimators.

    n_buckets : int, optional (default=5)
        The number of subgroups to build.

    method : str, optional (default='static')
        {'static', 'dynamic'}, if 'dynamic', build subgroups
        randomly with dynamic bucket size.

    bootstrap_estimators : bool, optional (default=False)
        Whether estimators are drawn with replacement.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator
        is the RandomState instance used by `np.random`.

    Returns
    -------
    combined_scores : Numpy array of shape (n_samples,)
        The combined outlier scores.

    """

    if mode != 'AOM' and mode != 'MOA':
        raise NotImplementedError(
            '{mode} is not implemented'.format(mode=mode))

    scores = check_array(scores)
    # TODO: add one more parameter for max number of estimators
    # use random_state instead
    # for now it is fixed at n_estimators/2
    n_estimators = scores.shape[1]
    check_parameter(n_buckets, 2, n_estimators, param_name='n_buckets')

    scores_buckets = np.zeros([scores.shape[0], n_buckets])

    if method == 'static':

        n_estimators_per_bucket = int(n_estimators / n_buckets)
        if n_estimators % n_buckets != 0:
            raise ValueError('n_estimators / n_buckets has a remainder. Not '
                             'allowed in static mode.')

        if not bootstrap_estimators:
            # shuffle the estimator order
            shuffled_list = shuffle(list(range(0, n_estimators, 1)),
                                    random_state=random_state)

            head = 0
            for i in range(0, n_estimators, n_estimators_per_bucket):
                tail = i + n_estimators_per_bucket
                batch_ind = int(i / n_estimators_per_bucket)
                if mode == 'AOM':
                    scores_buckets[:, batch_ind] = np.max(
                        scores[:, shuffled_list[head:tail]], axis=1)
                else:
                    scores_buckets[:, batch_ind] = np.mean(
                        scores[:, shuffled_list[head:tail]], axis=1)

                # increment index
                head = head + n_estimators_per_bucket
                # noinspection PyUnusedLocal
        else:
            for i in range(n_buckets):
                ind = sample_without_replacement(n_estimators,
                                                 n_estimators_per_bucket,
                                                 random_state=random_state)
                if mode == 'AOM':
                    scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
                else:
                    scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    elif method == 'dynamic':  # random bucket size
        for i in range(n_buckets):
            # the number of estimators in a bucket should be 2 - n/2
            max_estimator_per_bucket = RandomState(seed=random_state).randint(
                2, int(n_estimators / 2))
            ind = sample_without_replacement(n_estimators,
                                             max_estimator_per_bucket,
                                             random_state=random_state)
            if mode == 'AOM':
                scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
            else:
                scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    else:
        raise NotImplementedError(
            '{method} is not implemented'.format(method=method))

    if mode == 'AOM':
        return np.mean(scores_buckets, axis=1)
    else:
        return np.max(scores_buckets, axis=1)
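A hedged usage example with synthetic scores (assuming the module-level imports the snippet relies on: sklearn's check_array and shuffle, plus pyod's check_parameter): 10 estimators split into 5 static buckets of 2.

import numpy as np

scores = np.random.RandomState(0).rand(100, 10)
combined = _aom_moa_helper('AOM', scores, n_buckets=5, method='static',
                           bootstrap_estimators=False, random_state=0)
print(combined.shape)  # (100,): one combined outlier score per sample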
Example #45
    # Each benchmarked sampling algorithm is stored as a callable with the
    # signature:
    #
    #   sample(n_population, n_sample)
    #
    sampling_algorithm = {}

    ###########################################################################
    # Set Python core input
    sampling_algorithm["python-core-sample"] = \
        lambda n_population, n_sample: \
            random.sample(range(n_population), n_sample)

    ###########################################################################
    # Set custom automatic method selection
    sampling_algorithm["custom-auto"] = \
        lambda n_population, n_samples, random_state=None: \
            sample_without_replacement(n_population,
                                       n_samples,
                                       method="auto",
                                       random_state=random_state)

    ###########################################################################
    # Set custom tracking based method
    sampling_algorithm["custom-tracking-selection"] = \
        lambda n_population, n_samples, random_state=None: \
            sample_without_replacement(n_population,
                                       n_samples,
                                       method="tracking_selection",
                                       random_state=random_state)

    ###########################################################################
    # Set custom reservoir based method
    sampling_algorithm["custom-reservoir-sampling"] = \
        lambda n_population, n_samples, random_state=None: \
Example #46
def _parallel_build_estimators(n_estimators, ensemble, X, y, cost_mat,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimator accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using a mask, and then fit
        if bootstrap:
            indices = random_state.randint(0, n_samples, max_samples)
        else:
            indices = sample_without_replacement(n_samples,
                                                 max_samples,
                                                 random_state=random_state)

        sample_counts = np.bincount(indices, minlength=n_samples)

        estimator.fit((X[indices])[:, features], y[indices], cost_mat[indices, :])
        samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Example #47
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q, sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job.
    Now it supports queries and querywise sampling.
    It also breaks the PEP8 line length constraint now"""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features
    uQueries = np.unique(Q)

    sample_whole_queries = False
    if hasattr(ensemble, "sample_whole_queries"):
        sample_whole_queries = ensemble.sample_whole_queries

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0):
        if sample_whole_queries:
            max_samples = int(max_samples * len(uQueries))
        else:
            max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap

    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimator accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features, random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                if sample_whole_queries:
                    notQindices = uQueries[random_state.randint(0, len(uQueries), len(uQueries) - max_samples)]
                    notQindices.sort()
                    not_indices = reduce(np.append, [np.where(Q == i) for i in notQindices])
                else:
                    not_indices = sample_without_replacement(
                        n_samples, n_samples - max_samples, random_state=random_state
                    )

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, Q=Q, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
            else:
                if sample_whole_queries:
                    Qindices = uQueries[
                        sample_without_replacement(len(uQueries), max_samples, random_state=random_state)
                    ]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = sample_without_replacement(n_samples, max_samples, random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices])
            samples = sample_counts > 0.0

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Example #48
def _parallel_build_estimators(n_estimators, ensemble, all_X, all_y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""

    positives = np.where(all_y == 1)[0]
    unlabeled = np.where(all_y == 0)[0]
    
    X_positives = all_X[positives]
    X_unlabeled = all_X[unlabeled]
    y_positives = all_y[positives]
    y_unlabeled = all_y[unlabeled]

    # Retrieve settings
    n_samples, n_features = X_unlabeled.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    
    # Sample weights are not currently supported with PU bagging.
    if sample_weight is not None:
        raise ValueError("Can't currently support sample weight with PUBagging")

    support_sample_weight = False

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(all_X[:, features], all_y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            new_X = np.vstack((X_positives, X_unlabeled[indices]))
            new_y = np.concatenate((y_positives, y_unlabeled[indices]))

            estimator.fit(new_X[:, features], new_y)
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
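# The PU-bagging resample step in isolation (a minimal, self-contained
# sketch): every positive example is kept in each bag, while the unlabeled
# pool is bootstrapped.
import numpy as np

rng = np.random.RandomState(0)
all_y = np.array([1, 1, 0, 0, 0, 0, 0, 0])
positives = np.where(all_y == 1)[0]
unlabeled = np.where(all_y == 0)[0]

indices = rng.randint(0, len(unlabeled), len(unlabeled))  # bootstrap draw
bag = np.concatenate((positives, unlabeled[indices]))
print(bag)  # both positives appear once; unlabeled rows repeat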
def sparse_random_matrix(n_components, n_features, density='auto',
                         random_state=None):
    """Generalized Achlioptas random sparse matrix for random projection

    Setting density to 1 / 3 will yield the original matrix by Dimitris
    Achlioptas while setting a lower value will yield the generalization
    by Ping Li et al.

    If we note :math:`s = 1 / density`, the components of the random matrix are
    drawn from:

      - -sqrt(s) / sqrt(n_components)   with probability 1 / (2s)
      -  0                              with probability 1 - 1 / s
      - +sqrt(s) / sqrt(n_components)   with probability 1 / (2s)

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    density : float in range ]0, 1/3], optional
        Ratio of non-zero component in the random projection matrix.

        By default the value is set to the minimum density as recommended
        by Ping Li et al.: 1 / sqrt(n_features)

        Use density = 1 / 3.0 if you want to reproduce the results from
        Achlioptas, 2001.

    random_state : integer, RandomState instance or None (default)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components: numpy array or CSR matrix with shape [n_components, n_features]
        The generated sparse random matrix.

    See Also
    --------
    gaussian_random_matrix

    References
    ----------

    .. [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://www.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf

    .. [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    """
    _check_input_size(n_components, n_features)
    density = _check_density(density, n_features)
    rng = check_random_state(random_state)

    if density == 1:
        # skip index generation if totally dense
        components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1
        return 1 / np.sqrt(n_components) * components

    else:
        # Generate location of non zero elements
        indices = []
        offset = 0
        indptr = [offset]
        for i in range(n_components):
            # find the indices of the non-zero components for row i
            n_nonzero_i = rng.binomial(n_features, density)
            indices_i = sample_without_replacement(n_features, n_nonzero_i,
                                                   random_state=rng)
            indices.append(indices_i)
            offset += n_nonzero_i
            indptr.append(offset)

        indices = np.concatenate(indices)

        # Among non zero components the probability of the sign is 50%/50%
        data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1

        # build the CSR structure by concatenating the rows
        components = sp.csr_matrix((data, indices, indptr),
                                   shape=(n_components, n_features))

        return np.sqrt(1 / density) / np.sqrt(n_components) * components
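# Usage sketch for sparse_random_matrix (assuming the imports used above,
# i.e. numpy as np and scipy.sparse as sp): project down to 20 components
# and check the shape and non-zero density of the resulting CSR matrix.
R = sparse_random_matrix(n_components=20, n_features=100,
                         density='auto', random_state=42)
print(R.shape)                  # (20, 100)
print(R.nnz / float(20 * 100))  # roughly 1 / sqrt(100) = 0.1 non-zeros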
Example #50
def sample_without_replacement_method(n_population, n_samples,
                                      random_state=None):
    # `m` is the sampling method name, bound in the enclosing scope
    # (one of 'auto', 'tracking_selection', 'reservoir_sampling', 'pool').
    return sample_without_replacement(n_population, n_samples,
                                      method=m,
                                      random_state=random_state)
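# A sketch of how such a wrapper is typically bound (an assumption inferred
# from the free variable `m` above): close over one method name per sampler
# and compare the draws of the different algorithms.
def make_sampler(m):
    def sampler(n_population, n_samples, random_state=None):
        return sample_without_replacement(n_population, n_samples,
                                          method=m, random_state=random_state)
    return sampler

for m in ('auto', 'tracking_selection', 'reservoir_sampling', 'pool'):
    print(m, sorted(make_sampler(m)(100, 5, random_state=0)))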
Example #51
def _spark_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    print "building estimators"
    # Retrieve settings
    X = X.value
    y = y.value
    sample_weight = sample_weight.value

    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
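# The fraction-to-count conversion shared by all the builders above, shown
# in isolation (a minimal, self-contained sketch): a float in (0, 1] is
# interpreted as a fraction of the population, an integer is taken as-is.
import numbers
import numpy as np

n_samples, max_samples = 200, 0.25
if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
        (0.0 < max_samples <= 1.0)):
    max_samples = int(max_samples * n_samples)
print(max_samples)  # 50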
def _generator_fitted_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, verbose):
    """Private function used to build an iterator of estimators."""
    # Modified from sklearn.ensemble.bagging._parallel_build_estimators

    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0):
        max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")

    # Build estimators
    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features, random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(n_samples, n_samples - max_samples, random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples, max_samples, random_state=random_state)

            sample_counts = np.bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.0

        yield estimator, samples, features
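# Consumption sketch for the generator above (an assumption about the calling
# convention, targeting the older scikit-learn these snippets are written for,
# where BaggingClassifier takes base_estimator and _validate_estimator sets
# base_estimator_; MAX_INT, check_random_state, has_fit_parameter and
# sample_without_replacement are assumed to be in module scope as above).
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X = np.random.RandomState(0).rand(50, 4)
y = (X[:, 0] > 0.5).astype(int)

ensemble = BaggingClassifier(DecisionTreeClassifier(),
                             max_samples=0.5, max_features=2)
ensemble._validate_estimator()  # sets base_estimator_ without a full fit

for est, samples, features in _generator_fitted_estimators(
        3, ensemble, X, y, sample_weight=None, seeds=np.arange(3), verbose=0):
    print(samples.sum(), features)  # rows used and features drawn per member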